简单JAVA爬虫51Jobs

使用Jsoup工具，它是一个HTML解析器，可以直接解析某个地址或者HTML文件。还可通过DOM、CSS以及类似jQuery的操作方法操作数据。

Jsoup官方文档地址：https://jsoup.org/cookbook/introduction/parsing-a-document

注意：出现乱码时，需要查看网页的编码方式，并使用该编码方式解码。使用表单传输中文数据时，有些网站需要先进行URL编码才能正常传输中文=。=

主要代码如下：

package com.galoliy.spider.maven_spider.domain;

import java.io.BufferedInputStream;

import java.io.BufferedOutputStream;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.io.OutputStream;

import java.io.UnsupportedEncodingException;

import java.net.URLEncoder;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import org.jsoup.Connection;

import org.jsoup.Connection.Method;

import org.jsoup.Connection.Response;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

public class Cat5jobs {

    /**
     * Loads the page at {@code url}, reads the {@code action} attribute of its
     * form(s), and POSTs {@code keyword} to that address to obtain the search
     * result page.
     *
     * @param url     address of the index page that contains the search form
     * @param keyword search keyword; it is URL-encoded as GBK before being sent
     * @return the parsed result page; if an {@link IOException} occurs after the
     *         index page was parsed, the index page is returned instead; if the
     *         index page itself could not be fetched, {@code null}
     * @throws UnsupportedEncodingException if the "gbk" charset is not available
     */
    public Document getResultPage(String url,String keyword) throws UnsupportedEncodingException {
        // Encode up front: per the original author's note, the keyword arrives
        // garbled in the POST body if this conversion is skipped.
        final String encodedKeyword = URLEncoder.encode(keyword, "gbk");

        Document page = null;
        try {
            // Step 1: fetch the index page (its cookies are reused below).
            Response indexResponse = Jsoup.connect(url).method(Method.GET).execute();
            page = indexResponse.parse();

            // Step 2: the form's action attribute is the address of the result page.
            String searchAction = page.select("form").attr("action");

            // Step 3: build the search request.
            Connection search = Jsoup.connect(searchAction);
            search.data("keyword", encodedKeyword);
            search.userAgent("Mozilla");
            search.cookies(indexResponse.cookies());
            search.header("Accept-Language", "zh-CN,zh;q=0.9");
            search.timeout(300000);
            search.method(Method.POST);

            // Step 4: submit it and parse the answer.
            page = search.execute().parse();
        } catch (IOException e) {
            // NOTE(review): a failure in steps 3-4 leaves `page` pointing at the
            // index page, which is then returned as if it were the result.
            e.printStackTrace();
        }
        return page;
    }

    /**
     * Prints the search results for {@code keyword}, preferring local cache files
     * over a network request.
     *
     * <p>Two files are derived from the arguments: {@code dir + fileName + ".htm"}
     * (the raw result page) and {@code dir + fileName + "2.txt"} (the formatted
     * report).
     * <ul>
     *   <li>Both exist: the text report is read back and printed.</li>
     *   <li>Only the HTML exists: it is parsed, the report is built, saved and printed.</li>
     *   <li>Otherwise, or if reading the cache fails: the page is fetched via
     *       {@link #getResultPage(String, String)}, both files are written and the
     *       report is printed.</li>
     * </ul>
     *
     * @param url      address of the index page holding the search form
     * @param keyword  search keyword
     * @param dir      directory prefix, concatenated as-is with {@code fileName}
     *                 (so it must already end with a path separator)
     * @param fileName base name of the cache files, without extension
     */
    public void getResult(String url,String keyword,String dir,String fileName) {
        String htmlFilePath = dir + fileName + ".htm";
        String txtFilePath = dir + fileName + "2.txt";
        File htmlPath = new File(htmlFilePath);
        File txtPath = new File(txtFilePath);

        if (htmlPath.exists()) {
            try {
                String printSrc;
                if (txtPath.exists()) {
                    // Both cache files are present: just replay the saved report.
                    System.out.println("File not empty");
                    printSrc = printScreen(txtPath);
                } else {
                    // Only the HTML is cached: rebuild the report from it.
                    Document cached = Jsoup.parse(htmlPath, "utf-8");
                    if (!cached.children().isEmpty()) {
                        System.out.println("File not empty");
                    }
                    printSrc = printScreen(Screen51Jobs(cached));
                    saveFile(printSrc, txtFilePath);
                }
                System.out.println(printSrc);
                return;
            } catch (IOException e) {
                // The cache could not be read or written; fall back to crawling.
                System.err.println("Local cache unusable, crawling instead: " + e);
            }
        }

        System.out.println("file not found");
        try {
            Document doc = this.getResultPage(url, keyword);
            if (doc == null) {
                // Bail out before touching the disk so a failed fetch does not
                // leave an empty .htm file that later runs would treat as a cache.
                System.err.println("Could not fetch result page from " + url);
                return;
            }
            // FileOutputStream creates the files itself; no createNewFile() needed.
            saveFile(doc.toString(), htmlFilePath);
            String printStr = printScreen(Screen51Jobs(doc));
            saveFile(printStr, txtFilePath);
            // Print the report that was just built (previously an empty string was printed).
            System.out.println(printStr);
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Reads the whole file and returns its content as text.
     *
     * <p>The bytes are collected first and decoded once as UTF-8 (the encoding
     * {@code saveFile} writes), so multi-byte characters that straddle a read
     * boundary are decoded correctly and only the bytes actually read are used.
     *
     * @param path file to read
     * @return the file content decoded as UTF-8
     * @throws IOException if the file cannot be opened or read
     */
    private String printScreen(File path) throws IOException{
        // Fully qualified to avoid requiring a new import in the file header.
        java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
        // try-with-resources closes the stream even when read() throws.
        try (InputStream bis = new BufferedInputStream(new FileInputStream(path))) {
            byte[] bytes = new byte[1024 * 8];
            int len;
            while ((len = bis.read(bytes, 0, bytes.length)) != -1) {
                // Append only the `len` bytes filled by this read, not the whole buffer.
                collected.write(bytes, 0, len);
            }
        }
        return collected.toString("utf-8");
    }

    /**
     * Formats the extracted search data as a CRLF-separated text report.
     *
     * <p>Reads three keys from {@code screen}: {@code "keyword"},
     * {@code "totalquerydata"} and {@code "recruitmentlist"}. The last one is
     * expected to be a {@link List} of {@link Map}s; each map is written as one
     * {@code key == value} line per entry, followed by a blank line. Any
     * {@code List}/{@code Map} implementation is accepted, and a missing list is
     * treated as empty.
     *
     * @param screen data map as produced by {@code Screen51Jobs}
     * @return the formatted report
     * @throws IOException never thrown by this implementation; kept for
     *                     signature compatibility
     */
    private String printScreen(Map<?,?> screen) throws IOException {
        final String p = "\r\n";
        StringBuilder sb = new StringBuilder();

        sb.append(p).append(" KeyWord:").append(screen.get("keyword"))
          .append(p).append(p).append(" Total query data:").append(screen.get("totalquerydata"))
          .append(p).append(p).append(" Recruitment info:");

        // Cast to the interfaces only, so the caller is free to pass any implementation.
        List<?> list = (List<?>) screen.get("recruitmentlist");
        if (list != null) {
            for (Object o : list) {
                Map<?,?> map = (Map<?,?>) o;
                for (Map.Entry<?,?> entry : map.entrySet()) {
                    sb.append(p).append(entry.getKey()).append(" == ").append(entry.getValue());
                }
                sb.append(p);
            }
        }
        return sb.toString();
    }

    /**
     * Extracts the search keyword, the total-count text and the list of job
     * postings from a parsed result page.
     *
     * <p>The returned map holds:
     * <ul>
     *   <li>{@code "keyword"} - text of the {@code div[class=sbox]} elements,</li>
     *   <li>{@code "totalquerydata"} - text of the {@code div[class=rt]} elements
     *       whose own text starts with "共",</li>
     *   <li>{@code "recruitmentlist"} - one map per {@code div[class=el]} element with
     *       the keys {@code position}, {@code href}, {@code corporatename},
     *       {@code address}, {@code salary} and {@code releasedate}.</li>
     * </ul>
     * All lookups are scoped to {@code div[class=dw_table]div[id=resultList]}.
     *
     * @param doc parsed result page
     * @return the extracted data
     */
    private Map<?,?> Screen51Jobs(Document doc){
        Elements root = doc.select("div[class=dw_table]div[id=resultList]");

        String keywordText = root.select("div[class=sbox]").text();
        String totalText = root.select("div[class=rt]div:matchesOwn(^共)").text();
        Elements rows = root.select("div[class=el]");

        Map<String,Object> result = new HashMap<String,Object>();
        result.put("keyword", keywordText);
        result.put("totalquerydata", totalText);

        List<Map<String,Object>> postings = new ArrayList<Map<String,Object>>();
        for (Element row : rows) {
            Elements anchors = row.select("a");

            Map<String,Object> posting = new HashMap<String,Object>();
            posting.put("position", row.select("p[class~=^t1]").text());
            posting.put("href", anchors.attr("href"));
            // NOTE(review): this is the combined text of every <a> in the row, not
            // only a company link - confirm against the page markup.
            posting.put("corporatename", anchors.text());
            posting.put("address", row.select("span[class=t3]").text());
            posting.put("salary", row.select("span[class=t4]").text());
            posting.put("releasedate", row.select("span[class=t5]").text());
            postings.add(posting);
        }
        result.put("recruitmentlist", postings);

        return result;
    }

    /**
     * Writes {@code src} to the file at {@code path} as UTF-8, replacing any
     * existing content.
     *
     * @param src  text to store
     * @param path destination file path; the file is created if it does not exist
     * @throws IOException if the file cannot be opened or written
     */
    private void saveFile(String src,String path) throws IOException {
        byte[] bytes = src.getBytes("utf-8");
        // Closing the buffered stream flushes it; without the close, content
        // smaller than the internal buffer never reached the file.
        try (OutputStream bos = new BufferedOutputStream(new FileOutputStream(path))) {
            bos.write(bytes, 0, bytes.length);
        }
    }

简单JAVA爬虫51Jobs的更多相关文章

一个简单java爬虫爬取网页中邮箱并保存
此代码为一十分简单网络爬虫,仅供娱乐之用. java代码如下: package tool; import java.io.BufferedReader; import java.io.File; im ...
超简单的java爬虫
最简单的爬虫,不需要设定代理服务器,不需要设定cookie,不需要http连接池,使用httpget方法,只是为了获取html代码... 好吧,满足这个要求的爬虫应该是最基本的爬虫了.当然这也是做复杂 ...
java简单web爬虫(网页图片)
java简单web爬虫(网页图片)效果,执行main()方法后图片就下载道C盘的res文件夹中.没有的话创建一个文件夹代码里的常量根据自己的需求修改,代码附到下面. package com.sinit ...
学校实训作业：Java爬虫（WebMagic框架）的简单操作
项目名称:java爬虫项目技术选型:Java.Maven.Mysql.WebMagic.Jsp.Servlet 项目实施方式:以认知java爬虫框架WebMagic开发为主,用所学java知识完成指 ...
webmagic的设计机制及原理-如何开发一个Java爬虫
之前就有网友在博客里留言,觉得webmagic的实现比较有意思,想要借此研究一下爬虫.最近终于集中精力,花了三天时间,终于写完了这篇文章.之前垂直爬虫写了一年多,webmagic框架写了一个多月,这方 ...
JAVA爬虫 WebCollector
JAVA爬虫 WebCollector 爬虫简介: WebCollector是一个无须配置.便于二次开发的JAVA爬虫框架(内核),它提供精简的的API,只需少量代码即可实现一个功能强大的爬虫. 爬虫 ...
爬虫入门手写一个Java爬虫
本文内容涞源于罗刚老师的书籍 << 自己动手写网络爬虫一书 >> ; 本文将介绍 1: 网络爬虫的是做什么的? 2: 手动写一个简单的网络爬虫; 1: 网络爬虫是做 ...
JAVA爬虫实践（实践三：爬虫框架webMagic和csdnBlog爬虫）
WebMagic WebMagic是一个简单灵活的Java爬虫框架.基于WebMagic,你可以快速开发出一个高效.易维护的爬虫. 采用HttpClient可以实现定向的爬虫,也可以自己编写算法逻辑来 ...
java爬虫系列第一讲-爬虫入门
1. 概述 java爬虫系列包含哪些内容? java爬虫框架webmgic入门使用webmgic爬取 http://ady01.com 中的电影资源(动作电影列表页.电影下载地址等信息) 使用web ...

随机推荐

Chapter6 胞内信号网络
一.一条从细胞表面到细胞核的通路二.Ras蛋白处于复杂信号级联的中心位置胞外信号→酪氨酸激酶受体→Shc→Grb→Sos→Ras 三.酪氨酸的磷酸化控制着许多胞内信号蛋白的定位与活动 Src蛋白的 ...
设计模式之观察者模式（c++）
Observer 模式应该可以说是应用最多.影响最广的模式之一,因为 Observer 的一个实例 Model/View/Control( MVC) 结构在系统开发架构设计中有着很重要的地位和意义, ...
channel和Stream的对比
这篇文章主要想总结下NIO的channel的传统io中的stream的差别在哪.网上找了很多文章,都感觉只是说了概念.然后自己大概看了下源码,结合概念,整理一下.有些地方可能不是很准确,也希望可以给点 ...
hadoop2.4.0伪分布式搭建以及分布式关机重启后datanode没起来的解决办法
1.准备Linux环境 1.0点击VMware快捷方式,右键打开文件所在位置 -> 双击vmnetcfg.exe -> VMnet1 host-only ->修改subnet ip ...
文件描述符fd、文件指针fp和vfork()
1. fd:在形式上是一个非负整数.实际上他是一个索引值.指向kernal为每一个进程所维护的该进程打开文件的记录表. 当程序打开一个文件或者创建一个新文件的时候kernal向进程返回一个文件描述符. ...
pgsqls修改表字段长度
alter table T_RPACT_PROTO_EDIT_RECORD alter column remark type VARCHAR(1024); 需要注意type关键字
[CocoaPods]使用Pod Lib创建
入门我们将使用pod lib create引导过程来创建整个pod .那么让我们从初始命令开始: pod lib create MyLibrary 注意:要使用您自己的pod-template,您可 ...
SQL-2--TRIGGER
触发器TRIGGER 是一个被指定关联到一个表的数据库对象,当对一个表的特定事件出现时,它将被激活. 触发器是数据库响应 INSERT , UPDATE, DELITE 语句而自动执行的一条SQL语句 ...
【LeetCode】1. 两数之和
题目给定一个整数数组 nums 和一个目标值 target,请你在该数组中找出和为目标值的那两个整数,并返回他们的数组下标.你可以假设每种输入只会对应一个答案.但是,你不能重复利用这个数组中同样 ...
纯JavaScript实现俄罗斯方块（详细注释，ES6）
借鉴了慕课网的课程<基于websocket的火拼俄罗斯(单机版)>虽然改动比较多,但是还是核心部分没有改,加了一些不怎么好听的声音,和看起来并不好看的界面. CSS部分基本是瞎写的,因为对 ...

简单JAVA爬虫51Jobs

简单JAVA爬虫51Jobs的更多相关文章

随机推荐

热门专题