I haven't had time to write up proper notes yet. The crawl covered more than 8,000 singers; at an average of about 30 songs per singer, that works out to roughly 8,000 × 30 ≈ 240,000 songs.

run.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Zqf'
from kugoumusic.spiders.kugou import KugouSpiders  # the spider defined in kugou.py below
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings  # loads the project's settings.py

settings = get_project_settings()
process = CrawlerProcess(settings=settings)
process.crawl(KugouSpiders)  # crawl() can be called repeatedly to register several spiders
process.start()              # starts the crawl and blocks until it finishes
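Saved at the project root (the directory that contains scrapy.cfg), this script lets you launch the crawl with a plain python run.py, which is handier for debugging in an IDE than the scrapy command line.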

kugou.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Zqf'
import scrapy
from kugoumusic.items import KugoumusicItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class KugouSpiders(CrawlSpider):
    name = 'kugou'
    start_urls = ['http://www.kugou.com/']

    rules = (
        # Follow the singer index and its per-letter pagination pages (no callback: follow only)
        Rule(LinkExtractor(allow=[r'http://www.kugou.com/yy/html/singer.html',
                                  r'http://www.kugou.com/yy/singer/index/\d-([a-z]|null)-1.html'])),
        # Singer detail pages are handed to parse_item
        Rule(LinkExtractor(allow=[r'http://www.kugou.com/yy/singer/home/\d+.html']), callback='parse_item')
    )

    def parse_item(self, response):
        # Singer name from the page header
        singer = response.xpath('//div/div[@class="clear_fix"]/strong/text()').extract_first()
        # Titles of all songs listed on the singer's page
        songs = response.xpath('//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract()

        item = KugoumusicItem()
        item['singer'] = singer
        item['songs'] = songs
        yield item
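Before launching the full crawl, the two XPath expressions are easy to sanity-check in scrapy shell. A minimal session sketch; the singer id 26897 in the URL is only an illustration, any URL matching the second rule will do:

scrapy shell "http://www.kugou.com/yy/singer/home/26897.html"
>>> response.xpath('//div/div[@class="clear_fix"]/strong/text()').extract_first()
>>> response.xpath('//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract()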

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class KugoumusicItem(scrapy.Item):
    singer = scrapy.Field()  # singer name (one per item)
    songs = scrapy.Field()   # list of song titles scraped from the singer's page
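Since the pipeline below stores dict(item), each singer ends up as one MongoDB document shaped roughly like this (the values are illustrative, not real crawl output):

{
    'singer': 'Example Singer',
    'songs': ['Song A', 'Song B', 'Song C']
}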

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient


class KugoumusicPipeline(object):

    def open_spider(self, spider):
        # The connection parameters could also be read from settings.py:
        # mongo_config = spider.settings['MONGO_CONFIG']
        self.client = MongoClient(host='127.0.0.1', port=27017)
        self.coll = self.client['student_db']['kugou']
        self.li = []  # in-memory buffer of items, flushed in batches

    def close_spider(self, spider):
        self.insert()  # flush whatever is still buffered
        self.client.close()

    def insert(self):
        if self.li:  # insert_many() raises InvalidOperation on an empty list
            self.coll.insert_many(self.li)

    def process_item(self, item, spider):
        # Buffer every item, then flush to MongoDB once 100 have accumulated
        self.li.append(dict(item))
        if len(self.li) >= 100:
            self.insert()
            self.li = []
            print("Successfully inserted a batch of 100 documents")
        return item
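Once a crawl has finished, the collection can be spot-checked directly with pymongo. A minimal sketch against the same host, database and collection the pipeline writes to (count_documents assumes pymongo 3.7+):

from pymongo import MongoClient

client = MongoClient(host='127.0.0.1', port=27017)
coll = client['student_db']['kugou']
print(coll.count_documents({}))  # one document per singer
print(coll.find_one())           # inspect the shape of a stored document
client.close()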

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for kugoumusic project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'kugoumusic'

SPIDER_MODULES = ['kugoumusic.spiders']
NEWSPIDER_MODULE = 'kugoumusic.spiders'

# MONGO_CONFIG = ['192.168.62.35:1806',
#                 '192.168.62.240:1806',
#                 '192.168.62.23:1806',
#                 '192.168.62.32:1806',
#                 '192.168.62.25:1806',
#                 '192.168.62.28:1806',
#                 '192.168.62.241:1806']

# MONGO_CONFIG = {
#     'host': '127.0.0.1',
#     'port': 27017,
#     'user': 'root',
#     'password': '123456',
#     'db': 's1806',
#     'charset': 'utf8'
# }

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'kugoumusic (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'kugoumusic.middlewares.KugoumusicSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'kugoumusic.middlewares.KugoumusicDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'kugoumusic.pipelines.KugoumusicPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
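With the pipeline registered in ITEM_PIPELINES, the crawl can be started either through run.py above or from the project directory with the standard Scrapy command line:

scrapy crawl kugou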
