爬取淘宝商品数据并保存在excel中

1. 使用 re（正则表达式）实现

 import requests

 from requests.exceptions import RequestException

 import re,json

 import xlwt,xlrd

 # Module-level configuration and shared state.

 # Accumulates one dict per scraped product: get_target() appends to it and
 # save_to_excel() writes its contents out.

 DATA = []

 # Search keyword; interpolated into the request URLs and the output file name.

 KEYWORD = 'python'

 # Headers sent with every HTTP request (a desktop Chrome user-agent string).

 HEADERS = {'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome'\

                         '/63.0.3239.132 Safari/537.36'}

 # Number of result pages main() iterates over; each page advances the `s`
 # query offset by 44.

 MAX_PAGE = 10

 def get_target(data_list):
     """Flatten raw auction records and append them to the global DATA list.

     Every element of ``data_list`` is indexed with the keys ``title``,
     ``view_price``, ``view_sales``, ``view_fee``, ``item_loc``, ``nick`` and
     ``detail_url``; a missing key raises ``KeyError``.

     Returns:
         Always ``True``.
     """
     for auction in data_list:
         # NOTE(review): the Tmall flag is derived purely from ``view_fee``
         # (zero -> '是', non-zero -> '否'); presumably a heuristic -- confirm
         # against the real payload.
         row = dict(
             title=auction['title'],
             price=auction['view_price'],
             sales=auction['view_sales'],
             isTmall='是' if not float(auction['view_fee']) else '否',
             area=auction['item_loc'],
             name=auction['nick'],
             url=auction['detail_url'],
         )
         DATA.append(row)
     return True

 def get_html(url, *args):
     """Send an HTTP GET request and return the page source as text.

     Called with no extra positional arguments, the request carries HEADERS
     only and the cookies of the response are stored in the module-level
     ``COOKIES``.  Called with any extra argument, the previously stored
     cookies are sent along with the request instead.

     Args:
         url: Address to request.
         *args: Only its presence matters; any value selects the
             cookie-reusing mode.

     Returns:
         The decoded response text, or ``None`` when the request failed.
     """
     global COOKIES
     try:
         if not args:
             # A timeout keeps a stalled connection from hanging the crawl.
             response = requests.get(url, headers=HEADERS, timeout=10)
             COOKIES = response.cookies  # remembered for cookie-reusing calls
         else:
             # Fall back to "no cookies" instead of raising NameError when no
             # earlier call has stored any.
             response = requests.get(url, headers=HEADERS,
                                     cookies=globals().get('COOKIES'),
                                     timeout=10)
         # Decode using the encoding detected from the body content.
         response.encoding = response.apparent_encoding
         return response.text
     except RequestException:
         print('请求源码出错！')
         return None

 def parse_html(html, *args):
     """Parse page source, extract the auction list and hand it to get_target.

     With no extra positional arguments ``html`` is treated as a search result
     page: the JSON assigned to ``g_page_config`` is extracted and the list at
     ``mods -> itemlist -> data -> auctions`` is used.  With any extra argument
     ``html`` is treated as a JSONP response: the outermost ``{...}`` is
     extracted and the list at ``API.CustomizedApi -> itemlist -> auctions``
     is used.

     Args:
         html: Page source text; may be ``None`` or empty after a failed fetch.
         *args: Only its presence matters; any value selects the JSONP mode.

     Returns:
         ``None``.  When the payload is missing, is not valid JSON or lacks
         the expected keys, an error message is printed and nothing is added.
     """
     if not html:
         print('解析源码出错！')
         return None

     if not args:
         found = re.search(r'g_page_config = (.*?)g_srp_loadCss', html, re.S)
         # The assignment ends with ';' which is not part of the JSON document.
         payload = found.group(1).strip().rstrip(';') if found else None
         key_path = ('mods', 'itemlist', 'data', 'auctions')
     else:
         found = re.search(r'{.*}', html, re.S)
         payload = found.group(0) if found else None
         key_path = ('API.CustomizedApi', 'itemlist', 'auctions')

     if payload is None:
         # Pages without the payload no longer raise IndexError.
         print('解析源码出错！')
         return None

     try:
         node = json.loads(payload)
         for key in key_path:
             node = node[key]
     except (ValueError, KeyError, TypeError):
         # ValueError covers invalid JSON; KeyError/TypeError cover a payload
         # whose structure differs from the expected one.
         print('解析源码出错！')
         return None

     get_target(node)

 def save_to_excel():
     """Write every row of DATA to an .xls workbook named after KEYWORD.

     The workbook holds a single sheet: row 0 carries the column names and
     each following row is one entry of DATA, in the same column order.
     """
     columns = ('title', 'price', 'sales', 'isTmall', 'area', 'name', 'url')

     sheet_name = '淘宝%s数据'%KEYWORD
     workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
     worksheet = workbook.add_sheet(sheet_name)

     # Header row.
     for col, heading in enumerate(columns):
         worksheet.write(0, col, heading)

     # Data rows start right below the header.
     for row, record in enumerate(DATA, start=1):
         for col, key in enumerate(columns):
             worksheet.write(row, col, record[key])

     workbook.save('淘宝%s数据.xls'%KEYWORD)

 def main():
     """Crawl MAX_PAGE search result pages for KEYWORD and export them.

     Each page is fetched with get_html and parsed with parse_html, which
     fills DATA.  After the first page an additional API URL is requested with
     the stored cookies.  Finally DATA is written out with save_to_excel and
     the number of collected rows is printed.
     """
     from urllib.parse import quote  # local import: only main() needs it

     # Percent-encode the keyword so spaces, '&' or non-ASCII text cannot
     # break the query string.
     keyword = quote(KEYWORD, safe='')

     for offset in range(MAX_PAGE):
         url = 'https://s.taobao.com/search?q={}&s={}'.format(keyword, offset*44)
         html = get_html(url)
         if not html:
             # The fetch failed (get_html already reported it); skip this page
             # instead of passing None to the parser.
             continue
         parse_html(html)

         if offset == 0:
             # The first page has 12 extra items that are loaded
             # asynchronously -- apparently served by this API endpoint.
             api_url = ('https://s.taobao.com/api?_ksTS=1532524504679_226&callback=jsonp227&ajax=true&m=customized&'
                        'stats_click=search_radio_all:1&q={}'.format(keyword))
             html = get_html(api_url, 2)
             if html:
                 parse_html(html, 2)

     save_to_excel()
     print(len(DATA))


 if __name__ == '__main__':
     main()

爬取淘宝商品数据并保存在excel中的更多相关文章

scrapy+selenium　爬取淘宝商城商品数据存入到mongo中
1．配置信息 # 设置mongo参数 MONGO_URI = 'localhost' MONGO_DB = 'taobao' # 设置搜索关键字 KEYWORDS=['小米手机','华为手机'] # ...
使用requests、BeautifulSoup、线程池爬取艺龙酒店信息并保存到Excel中
import requests import time, random, csv from fake_useragent import UserAgent from bs4 import Beauti ...
使用requests、re、BeautifulSoup、线程池爬取携程酒店信息并保存到Excel中
import requests import json import re import csv import threadpool import time, random from bs4 impo ...
Python爬取猫眼电影100榜并保存到excel表格
首先我们前期要导入的第三方类库有; 通过猫眼电影100榜的源码可以看到很有规律如: 亦或者是: 根据规律我们可以得到非贪婪的正则表达式 """<div class ...
爬取拉勾网所有python职位并保存到excel表格对象方式
# 1.把之前案例,使用bs4,正则,xpath,进行数据提取. # 2.爬取拉勾网上的所有python职位. from urllib import request,parse import json ...
Python 爬取淘宝商品数据挖掘分析实战
Python 爬取淘宝商品数据挖掘分析实战项目内容本案例选择>> 商品类目:沙发: 数量:共100页 4400个商品: 筛选条件:天猫.销量从高到低.价格500元以上. 爬取淘宝商品 ...
Selenium+Chrome/phantomJS模拟浏览器爬取淘宝商品信息
#使用selenium+Chrome/phantomJS模拟浏览器爬取淘宝商品信息 # 思路: # 第一步:利用selenium驱动浏览器,搜索商品信息,得到商品列表 # 第二步:分析商品页数,驱动浏 ...
python3编写网络爬虫16-使用selenium 爬取淘宝商品信息
一.使用selenium 模拟浏览器操作爬取淘宝商品信息之前我们已经成功尝试分析Ajax来抓取相关数据,但是并不是所有页面都可以通过分析Ajax来完成抓取.比如,淘宝,它的整个页面数据确实也是通过A ...
Python爬虫，抓取淘宝商品评论内容!
作为一个资深吃货,网购各种零食是很频繁的,但是能否在浩瀚的商品库中找到合适的东西,就只能参考评论了!今天给大家分享用python做个抓取淘宝商品评论的小爬虫! 思路我们就拿"德州扒鸡" ...

随机推荐

第3章：LeetCode--算法：strStr KMP算法
https://leetcode.com/problems/implement-strstr/ 28. Implement strStr() 暴力算法: int ViolentMatch(char* ...
WUSTOJ 1275: 男神的逆袭（Java）
1275: 男神的逆袭题目计算两个日期相差的天数.更多内容点击标题. 分析下面说一下我的思路(自己写的,无扩展性): 给定一个日期,首先计算这个日期是这一年的第多少天. 给定两个日期,直 ...
MongoDB用户权限管理配置
MongoDB系列第一课:MongoDB简介 MongoDB系列第二课:MongoDB环境搭建 MongoDB系列第三课:MongoDB用户管理 MongoDB系列第四课:MongoDB数据库.集合.文档的 ...
Linux删除Tomcat中产生的所有log文件
#!/bin/bash #!/bin/bash #exact all log files #计算log文件个数log_number=`ls *.log |grep log -c` #当log文件数大于 ...
创建新表，自动授权trigger
需求一个用户下三个表,开发人员不定时进行rename表名称,create原表名称 as old_table 插入少量数据,另一个业务用户需要访问该表,由于表名称rename导致经常需要手工授权. 需 ...
（一）Centos之VMware虚拟机安装
一.下载 64位的VM12 安装包: http://pan.baidu.com/s/1bpzoXQZ 二.安装点击下一步: 老规矩,打勾,下一步: 这里我们新建一个文件夹 VM12 最好放在D盘或者 ...
CCF 201809-1 卖菜
题目: 问题描述在一条街上有n个卖菜的商店,按1至n的顺序排成一排,这些商店都卖一种蔬菜. 第一天,每个商店都自己定了一个价格.店主们希望自己的菜价和其他商店的一致,第二天,每一家商店都会根据他自己 ...
Abp 领域事件简单实践 <四> 聚合根的领域事件
聚合根有个 DomainEvents 属性. 首先聚合根是一个实体.这个实体的仓储有变化(增删改)的时候,会触发这个DomainEvents 里的事件.就像EventBus.Trigger一样. pu ...
编译openwrt backfire过程中出现的问题
参考的步骤如链接: http://www.right.com.cn/forum/forum.php?mod=viewthread&tid=124604 在make menuconfig的时候出 ...
AI 公司与比赛
科大讯飞网站:https://www.iflytek.com/ 比赛:http://challenge.xfyun.cn/2019/ AI 大学:https://www.aidaxue.com/ 华 ...

爬取淘宝商品数据并保存在excel中

爬取淘宝商品数据并保存在excel中的更多相关文章

随机推荐

热门专题