python 百度图片爬虫

# -*- coding:utf-8 -*-

#https://blog.csdn.net/qq_32166627/article/details/60882964

import requests

import os

import pinyin

import simplejson

def getManyPages(keyword, pages, timeout=10):
    """Query the Baidu image search JSON endpoint and collect result pages.

    Args:
        keyword: Search term; sent as both ``queryWord`` and ``word``.
        pages: Number of result pages to request. Each request asks for 30
            entries (``rn``) at offsets ``pn`` = 30, 60, ..., 30 * pages.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list holding, for every page that was fetched and decoded, the
        value of the ``data`` field of the JSON response. Pages that fail
        to download, fail to decode, or have no ``data`` field are skipped.
    """
    url = 'https://image.baidu.com/search/acjson'
    urls = []

    # NOTE(review): the first offset is 30, so offset 0 is never requested.
    for offset in range(30, 30 * pages + 30, 30):
        query = {
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': offset,
            'rn': 30,
            'gsm': '1e',
            '': '',
        }

        # A single unreachable page must not abort the whole crawl.
        try:
            response = requests.get(url, params=query, timeout=timeout)
        except requests.RequestException as err:
            print('request failed: ' + str(err))
            continue

        # ValueError is the common base of the JSON decode errors raised by
        # both the stdlib json module and simplejson.
        try:
            payload = response.json()
        except ValueError:
            print('【错误】simplejson.scanner.JSONDecodeError ')
            continue

        rgjson = payload.get('data')
        if rgjson is None:
            # Nothing to iterate over later; do not hand None to callers.
            continue
        urls.append(rgjson)

    return urls

def getImg(dataList, localPath, keyword, timeout=10):
    """Download the thumbnail of every search result into ``localPath``.

    Args:
        dataList: Iterable of result pages (as returned by getManyPages).
            Each page is an iterable of dicts; entries without a
            ``thumbURL`` key are reported and skipped. Empty or ``None``
            pages are ignored.
        localPath: Destination directory; created, including missing
            parents, when it does not exist yet.
        keyword: File-name prefix. Images are written as
            ``<localPath>/<keyword>_<n>.jpg`` where ``n`` counts the images
            saved so far, starting at 0.
        timeout: Seconds to wait for each image request before giving up.
    """
    if not os.path.isdir(localPath):
        os.makedirs(localPath)

    saved = 0
    for page in dataList:
        if not page:
            continue
        for entry in page:
            thumb_url = entry.get('thumbURL')
            if thumb_url is None:
                print('image not exist')
                continue

            print("down " + str(saved) + " image " + thumb_url)
            # One broken image should not stop the remaining downloads, and
            # an HTTP error body must not be stored as a .jpg file.
            try:
                ir = requests.get(thumb_url, timeout=timeout)
                ir.raise_for_status()
            except requests.RequestException as err:
                print("download failed: " + str(err))
                continue

            with open(localPath + "/" + keyword + '_%d.jpg' % saved, 'wb') as out:
                out.write(ir.content)
            saved += 1

def convert():
    """De-duplicate the second comma-separated field of ``stars_list.txt``.

    Reads ``stars_list.txt`` from the current directory, takes the second
    comma-separated field of every line, and writes each distinct value
    once, in first-seen order and one per line, to ``stars_list_clean.txt``.
    Lines with fewer than two fields (for example blank lines) are skipped.
    """
    seen = set()
    # Open the input first so a missing source file does not leave behind a
    # freshly truncated output file; both handles are closed on exit.
    with open("stars_list.txt", 'r') as face_file, \
            open("stars_list_clean.txt", 'w') as fp:
        for line in face_file:
            line = line.replace('\r', '').replace('\n', '').replace('\t', '')
            line_split = line.strip().split(",")
            if len(line_split) < 2:
                continue

            name = line_split[1]
            print(name)
            if name in seen:
                print(name, " is exist")
                continue

            seen.add(name)
            fp.write('%s\n' % name)

def debug():
    """Smoke test: crawl two result pages for one hard-coded name."""
    search_term = '胡因梦'
    file_prefix = "hym"

    # First argument: search keyword; second argument: number of pages.
    result_pages = getManyPages(search_term, 2)

    # Second argument: directory the thumbnails are saved into.
    getImg(result_pages, './hanxue', file_prefix)

def run():
    """Crawl thumbnails for every name listed in ``stars_list_clean.txt``.

    Steps:
      1. Read the names (one per line) from ``stars_list_clean.txt``.
      2. Write the pinyin form of every name to ``stars_list_en.txt``.
      3. For each name, create ``./stars_srcimg/<index>_<name>`` and fill it
         with the thumbnails of 5 result pages. A name whose directory
         already exists is skipped.
    """
    with open("stars_list_clean.txt", 'r') as face_file:
        stars_list = face_file.readlines()

    # Strip line endings and tabs once; both passes below use the result.
    names = [
        line.replace('\r', '').replace('\n', '').replace('\t', '')
        for line in stars_list
    ]

    with open("stars_list_en.txt", 'w') as fp:
        for name in names:
            fp.write('%s\n' % pinyin.get(name, format="strip"))

    out_dir = "./stars_srcimg/"
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    pages = 5
    print(pages * 30)  # requested entries per name (30 per page)

    for face_ID_index, keyword in enumerate(names):
        print(keyword)
        print(pinyin.get(keyword, format="strip"))

        face_ID = str(face_ID_index) + "_" + keyword
        facesavepath = out_dir + face_ID
        print(facesavepath)

        # The directory doubles as the "already processed" marker.
        # NOTE(review): an interrupted download leaves a partial directory
        # that is skipped on the next run.
        if os.path.exists(facesavepath):
            print(keyword + "  exist")
            continue
        os.mkdir(facesavepath)

        print("down " + keyword)
        # First argument: search keyword; second argument: number of pages.
        dataList = getManyPages(keyword, pages)
        # Second argument: directory the thumbnails are saved into.
        getImg(dataList, facesavepath, face_ID)

if __name__ == "__main__":
    # Runs the single-keyword smoke test; the full batch crawl is the
    # commented-out run() call below.
    debug()
    # run()

python 百度图片爬虫的更多相关文章

python写的百度图片爬虫
学了一下python正则表达式,写一个百度图片爬虫玩玩. 当技术遇上心术不正的人,就成我这样的2B青年了. python3.6开发.程序已经打包好,下载地址: http://pan.baidu.com ...
百度图片爬虫-python版-如何爬取百度图片?
上一篇我写了如何爬取百度网盘的爬虫,在这里还是重温一下,把链接附上: http://www.cnblogs.com/huangxie/p/5473273.html 这一篇我想写写如何爬取百度图片的爬虫 ...
百度图片爬虫-python版
self.browser=imitate_browser.BrowserBase() self.chance=0 self.chanc ...
【Python网络爬虫四】通过关键字爬取多张百度图片的图片
最近看了女神的新剧<逃避虽然可耻但有用>,同样男主也是一名程序员,所以很有共鸣被大只萝莉萌的一脸一脸的,我们来爬一爬女神的皂片. 百度搜索结果:新恒结衣本文主要分为4个部分: 1.下载 ...
Python爬虫：通过关键字爬取百度图片
使用工具:Python2.7 点我下载 scrapy框架 sublime text3 一.搭建python(Windows版本) 1.安装python2.7 ---然后在cmd当中输入python,界 ...
如何用Python爬虫实现百度图片自动下载？
Github:https://github.com/nnngu/LearningNotes 制作爬虫的步骤制作一个爬虫一般分以下几个步骤: 分析需求分析网页源代码,配合开发者工具编写正则表达式或 ...
python爬虫获取百度图片（没有精华，只为娱乐）
python3.7,爬虫技术,获取百度图片资源,msg为查询内容,cnt为查询的页数,大家快点来爬起来.注:现在只能爬取到百度的小图片,以后有大图片的方法,我会陆续发贴. #!/usr/bin/env ...
python爬虫-爬取百度图片
python爬虫-爬取百度图片(转) #!/usr/bin/python# coding=utf-8# 作者 :Y0010026# 创建时间 :2018/12/16 16:16# 文件 :spider ...
python爬虫之爬取百度图片
##author:wuhao##爬取指定页码的图片,如果需要爬取某一类的所有图片,整体框架不变,但需要另作分析#import urllib.requestimport urllib.parseimpo ...

随机推荐

【Android】1.0 第1章 C#之Android手机App开发
分类:C#.Android.VS2015:创建日期:2016-01-20 目前Android在全世界市场上大约有75%的占有率,国人Android手机的持有比例更甚,甚至达到90%以上.因此搞计算机的 ...
HTML5学习笔记（六）：CSS基本样式
背景需要注意:背景的所有属性都不会向下进行继承. 背景色我们可以设定一个纯色为背景色. p {background-color: red;} a {background-color: #ff000 ...
FLINK 案例分析
基于Flink流处理的动态实时超大规模用户行为分析 https://zhuanlan.zhihu.com/p/31548501 基于Flink流处理的动态实时超大规模用户行为分析 https://zh ...
Python Redis pipeline操作和Redis乐观锁保持数据一致性
Redis是建立在TCP协议基础上的CS架构,客户端client对redis server采取请求响应的方式交互. redis 乐观锁:也可理解为版本号比较机制,主要是说在读取数据的时候同时读取其版本 ...
如何解压POSIX tar archive文件
下载了一个xxx.gz的文件,使用x xxx.gz(zsh的x插件,十分之好用,再也不用担心tar后面该加哪些参数了)的命令解压,然后出现了一个文件,本以为解压后是一个文件夹:然后一脸蒙逼~ 突然又想 ...
Oracle数据库密码过期
按照如下步骤进行操作:1.查看用户的profile是哪个,一般是default: SQL>SELECT USERNAME,PROFILE FROM DBA_USERS; 2.查看指定概要文件(如 ...
对象序列化为何要定义serialVersionUID的来龙去脉
在很多应用中,需要对某些对象进行序列化,让它们离开内存空间,入住物理硬盘,以便长期保存.比如最常见的是Web服务器中的Session对象,当有10万用户并发访问,就有可能出现10万个Session对象 ...
oracle的db link
cd $ORACLE_HOME/network/admin vi tnsnames.ora 添加 CCPBS_19 = (DESCRIPTION = (ADDRESS_LIST = ...
基于CSS3鼠标滑过放大突出效果
还记得之前分享过一款CSS3图片悬停放大特效,效果非常不错.今天我们要再来分享一款类似的CSS鼠标滑过放大突出效果,只不过之前那个是图片,这次是色块,其实掌握了其CSS原理,任何网页元素都可以实现这种 ...
SpringMVC没有接受到参数的坑
其实说上来也不是SpringMVC的坑. 相同的一份代码,我在windows上用mvn打成jar放到linux上执行,POST请求可以接收到参数: 但是我直接在linux上从git拉取分支,并在lin ...

python 百度图片爬虫

python 百度图片爬虫的更多相关文章

随机推荐

热门专题