0.准备工作

1.相关教程

Python 爬虫系列教程：http://cuiqingcai.com/1052.html

Python Web课程：http://www.cnblogs.com/moonache/p/5110322.html

Python 中文参考文档：http://python.usyiyi.cn/

2.说明

下面的代码基本只处于可用阶段，欠缺移植性，本篇Bolg更多是一种记录

本篇Bolg中使用的是Python2.7

CPU信息从该网址获取：http://zj.zol.com.cn/

3.效果

1.获取CPU型号和主频信息

1.神伤的AJAX

本来想直接爬，结果发现http://zj.zol.com.cn/ 翻页后链接不变，通过chrome F12的控制台发现是通过AJAX刷新的

而且发现了 http://zj.zol.com.cn/index.php?c=Ajax_ParamResponse&a=GetGoods&subcateId=28&type=0&priceId=noPrice&page=3&manuId=¶mStr=&keyword=&locationId=1&queryType=0

我只需修改page=n 即可获取第n页的CPU信息

2.获取CPU名字

格式举列

tag.contents[0] :AMD \u7cfb\u5217 A8-7670\uff08\u76d2\u88c5\uff09<\/a>\r\n\t\t\t\t\t <\/h3>\r\n\t\t\t\t\t

manufacturer:AMD

modalDetail:A8-7670

modal:AMD A8-7670

#-*- coding: UTF-8 -*-

import urllib

import re

from bs4 import BeautifulSoup

url='http://zj.zol.com.cn/index.php?c=Ajax_ParamResponse&a=GetGoods&subcateId=28&type=0&priceId=noPrice&page=2&manuId=&paramStr=&keyword=&locationId=1&queryType=0'

html = urllib.urlopen(url).read()

soup=BeautifulSoup(html,"html.parser")

listModal=[]

listSpecs=[]

tags = soup.find_all("a",attrs={"target":"\\\"_blank\\\""})

cnt=0

for tag in tags:

    cnt+=1

    modalSubstr=tag.contents[0]

    #print 'modalSubstr:'+modalSubstr

    manufacturer=re.findall('(.+?) ',modalSubstr)[0]#非贪心匹配 遇到空格即中止，返回第一个匹配项

    #print 'manufacturer:'+manufacturer

    detailSubstr=re.findall(' ([0-9a-zA-Z- ]+)',modalSubstr)

    #print detailSubstr

    detailSubstr0=detailSubstr[0]

    #针对i3、i5、i7的处理

    if "i3" in modalSubstr:

        modalDetail="i3 "+detailSubstr0

    elif "i5" in modalSubstr:

        modalDetail="i5 "+detailSubstr0

    elif "i7" in modalSubstr:

        modalDetail="i7 "+detailSubstr0

    else:

        modalDetail=detailSubstr0

    #针对APU的处理

    if modalDetail=="APU":

        modalDetail+=" "+detailSubstr[1]

    modal=manufacturer+" "+modalDetail

    print "modal:"+modal

效果

3.获取CPU主频

except IndexError:因为中关村网站上最后一款CPU的主频信息暂无，所以针对这个情况它的规格（specs）为“Data Missed”

#-*- coding: UTF-8 -*-

import urllib

import re

from bs4 import BeautifulSoup

url='http://zj.zol.com.cn/index.php?c=Ajax_ParamResponse&a=GetGoods&subcateId=28&type=0&priceId=noPrice&page=2&manuId=&paramStr=&keyword=&locationId=1&queryType=0'

html = urllib.urlopen(url).read()

soup=BeautifulSoup(html,"html.parser")

listModal=[]

listSpecs=[]

tags = soup.find_all("a",attrs={"target":"\\\"_blank\\\""})

cnt=0

for tag in tags:

    cnt+=1

    print cnt

    substr=str(tag)[100:500]

    #以title='\"开头+任意小数+ GHz结尾

    specsDictionary=re.findall(r'title=\'\\\"([0-9.]+GHz)',substr)

    try:

        specs=specsDictionary[0]

    except IndexError:

        specs="Data Missed"

    print specs

效果

4.循环读取下一页并自动终止

一共有16页，本来可以直接用循环，但经观察发现每页开头的内容中有page值。而且当地址中的page>=16，index.php都只会返回page=16的内容。所以有了下面的代码用来循环读取下一页并自动终止。

urlLeft='http://zj.zol.com.cn/index.php?c=Ajax_ParamResponse&a=GetGoods&subcateId=28&type=0&priceId=noPrice&page='

urlRight='&manuId=&paramStr=&keyword=&locationId=1&queryType=0'

urlPageIndex=1

while (1):

    url=urlLeft+str(urlPageIndex)+urlRight

    html = urllib.urlopen(url).read()

    soup=BeautifulSoup(html,"html.parser")

    soupSub=str(soup)[0:50]

    pageIndex=int(re.findall('page\":([0-9]+)',soupSub)[0])

    if urlPageIndex==pageIndex:

        tags = soup.find_all("a",attrs={"target":"\\\"_blank\\\""})

        cnt=0

        for tag in tags:

            ......省略

        print "yes"+str(urlPageIndex)

        urlPageIndex+=1

    else:

        print "no"+str(urlPageIndex)

        break

5.输出为csv

python内置了csv读取和导入，我参考crifan上的的csv导出

import csv

with open('excel_2010_ms-dos.csv', 'rb') as csvfile:

    spamreader = csv.reader(csvfile, dialect='excel')

    for row in spamreader:

        print ', '.join(row)

6.最终代码

#-*- coding: UTF-8 -*-

import urllib

import re

import csv

from bs4 import BeautifulSoup

listModal=[]

listSpecs=[]

urlLeft='http://zj.zol.com.cn/index.php?c=Ajax_ParamResponse&a=GetGoods&subcateId=28&type=0&priceId=noPrice&page='

urlRight='&manuId=&paramStr=&keyword=&locationId=1&queryType=0'

urlPageIndex=1

while (1):

    url=urlLeft+str(urlPageIndex)+urlRight

    html = urllib.urlopen(url).read()

    soup=BeautifulSoup(html,"html.parser")

    soupSub=str(soup)[0:50]

    pageIndex=int(re.findall('page\":([0-9]+)',soupSub)[0])

    if urlPageIndex==pageIndex:

        tags = soup.find_all("a",attrs={"target":"\\\"_blank\\\""})

        cnt=0

        for tag in tags:

            cnt+=1

            modalSubstr=tag.contents[0]

            manufacturer=re.findall('(.+?) ',modalSubstr)[0]#非贪心匹配 遇到空格即中止，返回第一个匹配项

            detailSubstr=re.findall(' ([0-9a-zA-Z- ]+)',modalSubstr)

            detailSubstr0=detailSubstr[0]

            #针对i3、i5、i7的处理

            if "i3" in modalSubstr:

                modalDetail="i3 "+detailSubstr0

            elif "i5" in modalSubstr:

                modalDetail="i5 "+detailSubstr0

            elif "i7" in modalSubstr:

                modalDetail="i7 "+detailSubstr0

            else:

                modalDetail=detailSubstr0

            #针对APU的处理

            if modalDetail=="APU":

                modalDetail+=" "+detailSubstr[1]

            modal=manufacturer+" "+modalDetail

            listModal.append(modal)

            substr=str(tag)[100:500]

            #以title='\"开头+任意小数+ GHz结尾

            specsDictionary=re.findall(r'title=\'\\\"([0-9.]+GHz)',substr)

            try:

                specs=specsDictionary[0]

            except IndexError:

                specs="Data Missed"

            listSpecs.append(specs)

        print "yes"+str(urlPageIndex)

        urlPageIndex+=1

    else:

        print "no"+str(urlPageIndex)

        break

with open('Config.csv', 'wb') as csvfile:

    spamwriter = csv.writer(csvfile, dialect='excel')

    #write 标题行

    spamwriter.writerow(['Config_Type','Config_Modal','Config_Specs','Config_MinorSpecs'])

    i=0

    for elementModal in listModal:

        spamwriter.writerow(['CPU',listModal[i], listSpecs[i]])

        i+=1

来自为知笔记(Wiz)

Python 爬取中关村CPU名字和主频的更多相关文章

利用python爬取58同城简历数据
利用python爬取58同城简历数据利用python爬取58同城简历数据最近接到一个工作,需要获取58同城上面的简历信息(http://gz.58.com/qzyewu/).最开始想到是用pyth ...
利用Python爬取豆瓣电影
目标:使用Python爬取豆瓣电影并保存MongoDB数据库中我们先来看一下通过浏览器的方式来筛选某些特定的电影: 我们把URL来复制出来分析分析: https://movie.douban.com ...
Python爬取LOL英雄皮肤
Python爬取LOL英雄皮肤 Python 爬虫一实现分析在官网上找到英雄皮肤的真实链接,查看多个后发现前缀相同,后面对应为英雄的ID和皮肤的ID,皮肤的ID从00开始顺序递增,而英雄ID跟 ...
萌新学习Python爬取B站弹幕+R语言分词demo说明
代码地址如下:http://www.demodashi.com/demo/11578.html 一.写在前面之前在简书首页看到了Python爬虫的介绍,于是就想着爬取B站弹幕并绘制词云,因此有了这样 ...
Python爬取网页信息
Python爬取网页信息的步骤以爬取英文名字网站(https://nameberry.com/)中每个名字的评论内容,包括英文名,用户名,评论的时间和评论的内容为例. 1.确认网址在浏览器中输入初 ...
steam夏日促销悄然开始，用Python爬取排行榜上的游戏打折信息
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 不知不觉,一年一度如火如荼的steam夏日促销悄然开始了.每年通过大大小小 ...
Python爬取 | 唯美女生图片
这里只是代码展示,且复制后不能直接运行,需要配置一些设置才行,具体请查看下方链接介绍: Python爬取 | 唯美女生图片 from selenium import webdriver from fa ...
Python爬取 | 王者荣耀英雄皮肤海报
这里只展示代码,具体介绍请点击下方链接. Python爬取 | 王者荣耀英雄皮肤海报 import requests import re import os import time import wi ...
Python 爬取途虎养车全系车型轮胎保养数据
Python 爬取途虎养车全系车型轮胎保养数据 2021.7.27 更新增加标题.发布时间参数 demo文末自行下载,需要完整数据私聊我 2021.2.19 更新增加大保养数据 2020. ...

随机推荐

[实例]JAVA生成字母+随机数字并生成文件
package com.ishow.control.code; import java.io.*; import java.text.SimpleDateFormat; import java.uti ...
java中的foreach用法及总结
增强for(part1:part2){part3}; part2中是一个数组对象,或者是带有泛性的集合. part1定义了一个局部变量,这个局部变量的类型与part2中的对象元素的类型是一致的. pa ...
Asp.net core Razor 页面
创建asp.net core 空项目->MyWeb 修改Startup.cs启动文件添加Razor页面支持: public void ConfigureServices(IServiceColl ...
Sublime3中如何安装markdown插件支持
参考文章 Sublime Text下使用markdown的环境搭建和配置 MarkDown生成目录索引按下键Ctrl+Shift+p调出命令面板,找到Package Control: install ...
奥酷HTML5视频直播系统AMS6.0
今日,北极星通自主研发的流媒体服务系统Aoku Media Server6.0发布了,将正式支持HTML5直播,这使得网页中无需有flash播放插件或者其他插件,可直接观看直播,HTML5直播也会使得 ...
解决CXF的java.io.FileNotFoundException: class path resource [META-INF/cxf/cxf-extension-soap.xml] cannot be opened because it does not exist
以下是错误信息九月 25, 2017 8:22:04 下午 org.springframework.web.context.support.XmlWebApplicationContext prep ...
os模块中关于文件/目录常用的函数使用方法
os模块中关于文件/目录常用的函数使用方法函数名使用方法 getcwd() 返回当前工作目录 chdir(path) 改变工作目录 listdir(path='.') 列举指定目录中的文件名('. ...
docker数据库
拉取镜像 # docker pull mysql: 创建docker数据库容器 # docker run -d --name mysql -p 3306:3306 -e MYSQL_ROOT_PASS ...
bzoj 2176 最小表示
2176: Strange string Time Limit: 10 Sec Memory Limit: 259 MBSubmit: 419 Solved: 174[Submit][Status ...
Ubuntu忘记root密码怎么办？
http://www.linuxidc.com/Linux/2016-05/131256.htm

Python 爬取 中关村CPU名字和主频