python 百度图片爬虫

# -*- coding:utf-8 -*-

#https://blog.csdn.net/qq_32166627/article/details/60882964

import requests

import os

import pinyin

import simplejson

def getManyPages(keyword, pages):
    """Query the Baidu image-search JSON endpoint and collect each page's results.

    One GET request is sent per page to ``https://image.baidu.com/search/acjson``
    with ``pn`` set to 30, 60, ..., 30 * pages and ``rn`` fixed at 30.

    Args:
        keyword: Search term; sent as both ``queryWord`` and ``word``.
        pages: Number of requests to issue. Zero or a negative value issues
            none and yields an empty list.

    Returns:
        A list holding the ``data`` field of every response that could be
        decoded. Responses that are not valid JSON, or that carry no
        ``data`` field, are skipped.
    """
    # Query fields shared by every request; only 'pn' changes per page.
    base_query = {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': 201326592,
        'is': '',
        'fp': 'result',
        'queryWord': keyword,
        'cl': 2,
        'lm': -1,
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': -1,
        'z': '',
        'ic': 0,
        'word': keyword,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': 0,
        'istype': 2,
        'qc': '',
        'nc': 1,
        'fr': '',
        'pn': 0,
        'rn': 30,
        'gsm': '1e',
        '': '',
    }
    url = 'https://image.baidu.com/search/acjson'

    urls = []
    # NOTE(review): offsets start at 30, so offset 0 is never requested --
    # confirm that skipping it is intentional.
    for offset in range(30, 30 * pages + 30, 30):
        query = dict(base_query)
        query['pn'] = offset
        response = requests.get(url, params=query)
        try:
            payload = response.json()
        except ValueError:
            # Both simplejson's and the stdlib's JSONDecodeError derive from
            # ValueError, so this works whichever decoder requests uses.
            print('【错误】simplejson.scanner.JSONDecodeError ')
            continue
        page_data = payload.get('data')
        if page_data is None:
            # Without a 'data' field there is nothing to download; appending
            # None would only break consumers that iterate each page.
            continue
        urls.append(page_data)
    return urls

def getImg(dataList, localPath, keyword):
    """Download the thumbnail of every search result into ``localPath``.

    Args:
        dataList: List of result pages; each page is an iterable of dicts
            that may carry a ``thumbURL`` key. Pages that are ``None`` or
            empty are skipped.
        localPath: Directory to save into; created (including missing
            parents) when it does not exist.
        keyword: File-name prefix. Images are written as
            ``<keyword>_<n>.jpg`` with ``n`` counting up from 0.
    """
    if not os.path.exists(localPath):  # create the target folder
        os.makedirs(localPath)

    saved = 0
    for page in dataList:
        if not page:
            continue
        for item in page:
            thumb_url = item.get('thumbURL')
            if thumb_url is None:
                print('image not exist')
                continue
            print("down " + str(saved) + " image " + thumb_url)
            response = requests.get(thumb_url)
            # The with-block closes the file even if the write fails.
            with open(localPath + "/" + keyword + '_%d.jpg' % saved, 'wb') as image_file:
                image_file.write(response.content)
            saved += 1

def convert():
    """Write the distinct second CSV fields of ``stars_list.txt`` to a new file.

    Reads ``stars_list.txt`` from the current working directory, takes the
    second comma-separated field of each line, and writes every distinct
    value once, in first-seen order, one per line, to
    ``stars_list_clean.txt``. Lines with fewer than two fields (for example
    blank lines) are skipped.
    """
    seen = set()
    with open("stars_list.txt", 'r') as face_file, \
            open("stars_list_clean.txt", 'w') as fp:
        for line in face_file:
            line = line.replace('\r', '').replace('\n', '').replace('\t', '')
            fields = line.strip().split(",")
            if len(fields) < 2:
                # No second column on this line; nothing to record.
                continue
            name = fields[1]
            print(name)
            if name in seen:
                print(name + " is exist")
                continue
            seen.add(name)
            fp.write('%s\n' % name)

def debug():
    """Smoke-test the crawler with a single hard-coded search term.

    Fetches two result pages for a fixed keyword and stores the thumbnails
    under ``./hanxue`` with the file-name prefix ``hym``.
    """
    search_term = '胡因梦'
    file_prefix = "hym"

    # Argument 1: search keyword; argument 2: number of pages to fetch.
    result_pages = getManyPages(search_term, 2)
    # Argument 2: directory the images are saved into.
    getImg(result_pages, './hanxue', file_prefix)

def run():
    """Download images for every name listed in ``stars_list_clean.txt``.

    Steps:
      1. Read the names, one per line, from ``stars_list_clean.txt``.
      2. Write the ``pinyin.get(name, format="strip")`` form of each name,
         line for line, to ``stars_list_en.txt``.
      3. For each name create ``./stars_srcimg/<line index>_<name>`` and
         download five result pages into it. A name whose directory already
         exists is skipped, so an interrupted run can be restarted.

    Blank lines are written through as blank lines to ``stars_list_en.txt``
    and are otherwise ignored; they still occupy an index so directory
    numbering stays tied to the line position.
    """
    with open("stars_list_clean.txt", 'r') as face_file:
        names = [
            line.replace('\r', '').replace('\n', '').replace('\t', '')
            for line in face_file
        ]

    # Romanise each name once and reuse the result for the file and the log.
    english_names = [
        pinyin.get(name, format="strip") if name else ''
        for name in names
    ]
    with open("stars_list_en.txt", 'w') as fp:
        for keyword_english in english_names:
            fp.write('%s\n' % keyword_english)

    save_root = "./stars_srcimg/"
    if not os.path.exists(save_root):
        os.mkdir(save_root)

    pages = 5
    maxnum = pages * 30
    print(maxnum)

    for face_ID_index, keyword in enumerate(names):
        if not keyword:
            # An empty name would create "<index>_" and search for nothing.
            continue
        print(keyword)
        print(english_names[face_ID_index])

        face_ID = str(face_ID_index) + "_" + keyword
        facesavepath = save_root + face_ID
        print(facesavepath)

        if os.path.exists(facesavepath):
            print(keyword + " exist")
            continue
        os.mkdir(facesavepath)

        print("down " + keyword)
        # Argument 1: search keyword; argument 2: number of pages to fetch.
        dataList = getManyPages(keyword, pages)
        # Argument 2: directory the images are saved into.
        getImg(dataList, facesavepath, face_ID)

if __name__ == "__main__":
    # Script entry point: runs the single-keyword smoke test; the batch
    # download over the whole name list is left disabled below.
    debug()
    # run()

python 百度图片爬虫的更多相关文章

python写的百度图片爬虫
学了一下python正则表达式,写一个百度图片爬虫玩玩. 当技术遇上心术不正的人,就成我这样的2B青年了. python3.6开发.程序已经打包好,下载地址: http://pan.baidu.com ...
百度图片爬虫-python版-如何爬取百度图片?
上一篇我写了如何爬取百度网盘的爬虫,在这里还是重温一下,把链接附上: http://www.cnblogs.com/huangxie/p/5473273.html 这一篇我想写写如何爬取百度图片的爬虫 ...
百度图片爬虫-python版
self.browser=imitate_browser.BrowserBase() self.chance=0 self.chanc ...
【Python网络爬虫四】通过关键字爬取多张百度图片的图片
最近看了女神的新剧《逃避虽然可耻但有用》,同样男主也是一名程序员,所以很有共鸣被大只萝莉萌的一脸一脸的,我们来爬一爬女神的皂片. 百度搜索结果:新垣结衣本文主要分为4个部分: 1.下载 ...
Python爬虫：通过关键字爬取百度图片
使用工具:Python2.7 点我下载 scrapy框架 sublime text3 一.搭建python(Windows版本) 1.安装python2.7 ---然后在cmd当中输入python,界 ...
如何用Python爬虫实现百度图片自动下载？
Github:https://github.com/nnngu/LearningNotes 制作爬虫的步骤制作一个爬虫一般分以下几个步骤: 分析需求分析网页源代码,配合开发者工具编写正则表达式或 ...
python爬虫获取百度图片（没有精华，只为娱乐）
python3.7,爬虫技术,获取百度图片资源,msg为查询内容,cnt为查询的页数,大家快点来爬起来.注:现在只能爬取到百度的小图片,以后有大图片的方法,我会陆续发贴. #!/usr/bin/env ...
python爬虫-爬取百度图片
python爬虫-爬取百度图片(转) #!/usr/bin/python# coding=utf-8# 作者 :Y0010026# 创建时间 :2018/12/16 16:16# 文件 :spider ...
python爬虫之爬取百度图片
##author:wuhao##爬取指定页码的图片,如果需要爬取某一类的所有图片,整体框架不变,但需要另作分析#import urllib.requestimport urllib.parseimpo ...

随机推荐

【Android】1.2 创建Android模拟器
分类:C#.Android.VS2015: 创建日期:2016-01-20 调试手机应用程序一般先用模拟器来实现,只是因为每次都发布到手机上调试太麻烦了.当应用程序在模拟器上调试没错后,再发布到手机 ...
Push UIViewController with different orientation to previous
转自:http://stackoverflow.com/questions/6695837/push-uiviewcontroller-with-different-orientation-to-pr ...
schema中字段类型的定义
当schema中字段类型为String时,保存的时候如果该字段为Number也可以保存成功,mongoose会自动将其转换为数字字符串. 当schema中字段类型为Number时,保存的时候如果该字段 ...
Flink papers
Around 2009 the Stratosphere research project started at the TU Berlin which a few years later was s ...
每日英语：What You Like Best: Shopping, Food and Tech
In a year that featured one of history's biggest corporate buyouts, a stock-market surge reminiscent ...
SQLite - Java
安装在 Java 程序中使用 SQLite 之前,我们需要确保机器上已经有 SQLite JDBC Driver 驱动程序和 Java.可以查看 Java 教程了解如何在计算机上安装 Java.现在 ...
UNIX环境高级编程 apue.h头文件的配置
http://jimslinbing.blog.163.com/blog/static/85054319201292712414518/ 1.到http://www.apuebook.com下载源码2 ...
SPSS简单使用
当我们的调查问卷在把调查数据拿回来后,我们该做的工作就是用相关的统计软件进行处理,在此,我们以spss为处理软件,来简要说明一下问卷的处理过程,它的过程大致可分为四个过程:定义变量.数据录入.统计分析 ...
Python实例获取mp3文件的tag信息
下面利用一个python的实例程序,来学习python.这个程序的目的就是分析出所有MP3文件的Tag信息并输出. 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 ...
算法篇---java经典问题！！！
问题一:==与equal的区别? ==和 equals 都是比较的,而前者是运算符,后者则是一个方法,基本数据类型和引用数据类型都可以使用运算符==,而只有引用类型数据才可以使用 equals,下面具 ...

python 百度图片爬虫

python 百度图片爬虫的更多相关文章

随机推荐

热门专题