环境

操作系统：CentOS 6.7 32-bit

Python版本：2.6.6

第三方插件

selenium

PhantomJS

BeautifulSoup（配合 lxml 解析器使用）

requests（用于下载图片）

代码

# -*- coding: utf-8 -*-

import sys

reload(sys)

sys.setdefaultencoding('utf-8')

'''

作者：昨夜星辰

'''

import re

import os

import time

import shutil

import requests

import subprocess

from bs4 import BeautifulSoup

from selenium import webdriver

# Build an absolute URL from a scraped link.
def joint_url(string):
    """Turn a protocol-relative link into an absolute HTTPS URL.

    Args:
        string: Link text taken from an ``href``/``src`` attribute, normally
            of the protocol-relative form ``//host/path``.

    Returns:
        ``string`` prefixed with ``https:``. A link that already starts with
        ``http://`` or ``https://`` is returned unchanged so it is never
        turned into a malformed ``https:https://...`` URL.
    """
    if string.lower().startswith(('http://', 'https://')):
        return string
    return 'https:' + string

# Replace whatever exists at the path with a fresh, empty directory.
def create_folder(path):
    """Make ``path`` a fresh, empty directory.

    Whatever already sits at ``path`` is removed first: a real directory is
    deleted recursively, while a regular file or a symbolic link (including a
    dangling one) is unlinked. The directory is then always created with
    ``os.mkdir``, so the parent directory must already exist.

    Args:
        path: Location of the directory to (re)create.

    Raises:
        OSError: If removal or creation fails, e.g. the parent is missing.
    """
    # lexists() also reports dangling symlinks, which exists() would miss and
    # which would then make mkdir() fail because the name is already taken.
    if os.path.lexists(path):
        # rmtree() refuses to operate on a symlink, so only real directories
        # go to it; a link is unlinked without touching its target.
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path)
        else:
            os.remove(path)
    os.mkdir(path)

root_folder = '淘女郎'

create_folder(root_folder)

url = 'https://mm.taobao.com/json/request_top_list.htm?page=1'

browser = webdriver.PhantomJS()

browser.get(url)

bs = BeautifulSoup(browser.page_source, 'lxml')

for top in bs('p', 'top'):

    mm_url = joint_url(top.find('a')['href'])

    mm_name = top.find('a').text

    mm_age = top.find('em').text

    mm_city = top.find('span').text

    mm_folder = '%s/%s' % (root_folder, mm_name)

    create_folder(mm_folder)

    print '发现一位美眉，她叫做%s，今年%s，住在%s，现在开始爬取她的个人页面……' % (mm_name, mm_age, mm_city)

    browser.get(mm_url)

    bs1 = BeautifulSoup(browser.page_source, 'lxml')

    base_info = bs1.find('ul', 'mm-p-info-cell clearfix')

    info_list = base_info('span')

    result = []

    result.append('昵称：' + info_list[0].text)

    result.append('生日：' + info_list[1].text.strip())

    result.append('所在城市：' + info_list[2].text)

    result.append('职业：' + info_list[3].text)

    result.append('血型：' + info_list[4].text)

    result.append('学校/专业：' + info_list[5].text)

    result.append('风格：' + info_list[6].text)

    result.append('身高：' + base_info.find('li', 'mm-p-small-cell mm-p-height').find('p').text)

    result.append('体重：' + base_info.find('li', 'mm-p-small-cell mm-p-weight').find('p').text)

    result.append('三围：' + base_info.find('li', 'mm-p-small-cell mm-p-size').find('p').text)

    result.append('罩杯：' + base_info.find('li', 'mm-p-small-cell mm-p-bar').find('p').text)

    result.append('鞋码：' + base_info.find('li', 'mm-p-small-cell mm-p-shose').find('p').text)

    print '资料收集完毕，正在保存她的个人资料……'

    filename = '%s/%s.txt' % (mm_folder, mm_name)

    with open(filename, 'w') as f:

        f.write('\r\n'.join(result))

    print '保存完毕！现在开始爬取她的个人相册……'

    album_menu_url = joint_url(bs1.find('ul', 'mm-p-menu').find('a')['href'])

    browser.get(album_menu_url)

    time.sleep(3)

    bs2 = BeautifulSoup(browser.page_source, 'lxml')

    album_number = 1

    for album_info in bs2('div', 'mm-photo-cell-middle'):

        album_url = joint_url(album_info.find('h4').find('a')['href'])

        album_name = album_info.find('h4').find('a').text.strip()

        album_size = album_info.find('span', 'mm-pic-number').text

        print '现在开始爬取她的第%d个相册，相册名为：《%s》%s……' % (album_number, album_name, album_size)

        browser.get(album_url)

        js1 = 'return document.body.scrollHeight'

        js2 = 'window.scrollTo(0, document.body.scrollHeight)'

        old_scroll_height = 0

        while(browser.execute_script(js1) > old_scroll_height):

            old_scroll_height = browser.execute_script(js1)

            browser.execute_script(js2)

            time.sleep(3)

        bs3 = BeautifulSoup(browser.page_source, 'lxml')

        photo_number = 1

        for photo_area in bs3('div', 'mm-photoimg-area'):

            print '现在开始下载她这个相册的第%d张图片……' % photo_number,

            photo_url = joint_url(photo_area.find('a')['href'])

            browser.get(photo_url)

            bs4 = BeautifulSoup(browser.page_source, 'lxml')

            big_img_url = joint_url(bs4.find('img', id='J_MmBigImg')['src'])

            content = requests.get(big_img_url).content

            filename = '%s/%d.jpg' % (mm_folder, photo_number)

            with open(filename, 'wb') as f:

                f.write(content)

            print '下载完毕！'

            photo_number += 1

        album_number += 1

爬虫实例——爬取淘女郎相册（通过selenium、PhantomJS、BeautifulSoup爬取）的更多相关文章

数据抓取的艺术（一）：Selenium+Phantomjs数据抓取环境配置
数据抓取的艺术(一):Selenium+Phantomjs数据抓取环境配置 2013-05-15 15:08:14 分类: Python/Ruby 数据抓取是一门艺术,和其他软件不同,世界上 ...
Selenium&PhantomJS 完成爬取网络代理
Selenium模块是一套完整的Web应用程序测试系统,它包含了测试的录制(SeleniumIDE).编写及运行(Selenium Remote Control)和测试的并行处理(Selenimu G ...
动态网页爬取例子（WebCollector+selenium+phantomjs）
目标:动态网页爬取说明:这里的动态网页指几种可能:1)需要用户交互,如常见的登录操作:2)网页通过JS / AJAX动态生成,如一个html里有<div id="test" ...
动态网页爬取样例（WebCollector+selenium+phantomjs）
目标:动态网页爬取说明:这里的动态网页指几种可能:1)须要用户交互,如常见的登录操作:2)网页通过JS / AJAX动态生成.如一个html里有<div id="test" ...
selenium+phantomjs+pyquery 爬取淘宝商品信息
from selenium import webdriver from selenium.common.exceptions import TimeoutException from selenium ...
Python 之selenium+phantomJS斗鱼抓取案例
from selenium import webdriver from bs4 import BeautifulSoup import time if __name__ == '__main__': ...
scrapy爬虫实例(1)
爬虫实例对象阳光问政平台目标 : 主题,时间,内容爬取思路预先设置好items import scrapy class SuperspiderItem(scrapy.Item): title ...
Selenium+PhantomJS替代方案
问题描述: python3在使用selenium+PhantomJS动态抓取网页时,出现如下报错信息: UserWarning: Selenium support for PhantomJS has ...
Selenium+PhantomJS使用时报错原因及解决方案
问题今天在使用selenium+PhantomJS动态抓取网页时,出现如下报错信息: UserWarning: Selenium support for PhantomJS has been dep ...

随机推荐

python程序设计——面向对象程序设计：类
理解面向对象基本原则是,计算机程序由多个能够起到子程序作用的单元或对象组合而成关键性观念是,数据以及对数据的操作封装在一起,组成一个相互依存.不可分割的整体,即对象 python面向对象特性完全 ...
持续集成之TeamCity 配置
xcopy /S /Y CodeFirstDemo\CodefirstDemo.Web D:\publish\welcome\Web
python 智能合约日志操作
from __future__ import unicode_literals import json from time import sleep, time # 中文编码 def encode_s ...
codeforces 228E The Road to Berland is Paved With Good Intentions（2-SAT）
Berland has n cities, some of them are connected by bidirectional roads. For each road we know wheth ...
ZOJ 2760 How Many Shortest Path（最短路径+最大流）
Description Given a weighted directed graph, we define the shortest path as the path who has the sma ...
Train Problem（栈的应用）
Description As the new term comes, the Ignatius Train Station is very busy nowadays. A lot of studen ...
软工第三次作业——个人PSP
9.22--9.26本周例行报告 1.PSP(personal software process )个人软件过程. 类型任务预计时间开始时间结束时间中断时间实际用时准备工作学习重定向 ...
Thunder团队第六周 - Scrum会议4
Scrum会议4 小组名称:Thunder 项目名称:i阅app Scrum Master:胡佑蓉工作照片: 苗威同学在拍照,所以不在照片内. 参会成员: 王航:http://www.cnblogs ...
DWORD WORD到INT的转换
最近在做一个有关TCP/TP通信的消息解析,涉及到了这方面的转换,记录一下. 首先,如果是在网络传输.消息解析的情况下,要注意一下网络传送使用的是大端还是小端模式,这影响到我们的高低位的传输顺序. W ...
POJ 2229 计数DP
dp[i]代表是数字i的最多组合数如果i是一个奇数,i的任意一个组合都包含1,所以dp[i] = dp[i-1] 如果i是一个偶数,分两种情况讨论,一种是序列中包含1,因此dp[i]=dp[i-1]一 ...

爬虫实例——爬取淘女郎相册（通过selenium、PhantomJS、BeautifulSoup爬取）

环境

第三方插件

代码

爬虫实例——爬取淘女郎相册（通过selenium、PhantomJS、BeautifulSoup爬取）的更多相关文章

随机推荐

热门专题