这里从车商网上抓取数据,请保持良好的职业道德,不要将数据用于商业用途。工信部官网有汽车方面的公告目录,那里有最全的 PDF 或 Word 数据;鉴于 Word 和 PDF 解析繁琐且耗时,我暂时用这个网站的数据进行测试。

Spider主要代码:

package tk.mybatis.springboot.util;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.mysql.jdbc.Connection;
import com.mysql.jdbc.PreparedStatement;
import tk.mybatis.springboot.model.AutobatchDirectory;
import tk.mybatis.springboot.service.AutobatchDirectoryService;

/**
 * Crawler that walks the vehicle announcement ("batch") listing of cn357.com,
 * follows every batch to its paginated item list, parses each item's detail
 * table into an {@link AutobatchDirectory} and stores it either through plain
 * JDBC or through an {@link AutobatchDirectoryService}.
 *
 * <p>All methods are static and block the calling thread; fixed pauses are
 * inserted between HTTP requests so the remote site is not hammered.
 */
public class AutoBatchSpider {

    // Original data source: http://www.cn357.com/notice_list/
    public static final String web = "http://www.cn357.com";

    /** Timeout in milliseconds passed to every jsoup request. */
    private static final int timeOut = 30000;

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT = "Mozilla/5.0";

    /** Insert statement used by {@link #saveByJdbc(AutobatchDirectory)}; 48 columns, 48 placeholders. */
    private static final String INSERT_SQL = "insert into autobatch_directory ("
            + "F_ANNOU_TYPE, F_ANNOU_BATCH, F_VEHICLE_BRAND, F_VEHICLE_TYPE,"
            + "F_MAX_MASS, F_TOTAL_MASS, F_WHOLE_MASS, F_FUEL_TYPE, "
            + "F_BLOWOFF_STANDARD, F_AXLE_NUMBER, F_WHEELBASE, F_AXLE_WEIGHT,"
            + "F_SPRING_NUMBER, F_TYRE_NUMBER,F_TYRE_SIZE, F_DEPARTURE_ANGLE, "
            + "F_FREAR_SUSPENSION, F_FRONT_GAUGE, F_BACK_GAUGE, F_VIN_CODE, "
            + "F_VEHICLE_LENGTH, F_VEHICLE_WIDTH, F_VEHICLE_HEIGHT,F_CARGO_LENGTH,"
            + "F_CARGO_WIDTH, F_CARGO_HEIGHT,F_MAX_SPEED, F_MAX_PASSENGER,"
            + "F_CAB_NUMBER, F_STEERING_TYPE, F_TOTAL_MASS_TRAILER,"
            + "F_LOAD_MASS_FACTOR, F_MAX_SEMITRAILER, F_ENTERPRISE_NAME, F_ENTERPRISE_ADDRESS, "
            + "F_ENTERPRISE_PHONE, F_ENTERPRISE_FAX, F_POSTCODE, F_CHASSIS_ONE,"
            + "F_CHASSIS_TWO, F_CHASSIS_THREE, F_CHASSIS_FOUR, F_ENGINE_TYPE,"
            + "F_ENGINE_PRO, F_ENGINE_TRADEMARK, F_OUTPUT_VOLUME, F_POWER, F_REMARK) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,"
            + "?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";

    /**
     * Crawls every announcement batch linked from the listing page and stores
     * each parsed record with {@link #saveByJdbc(AutobatchDirectory)}.
     *
     * <p>A failed insert is printed and the crawl continues with the next
     * record. Detail pages whose layout is not recognised are skipped.
     *
     * @param listurl URL of the batch listing page
     * @throws InterruptedException if the thread is interrupted during a pause between requests
     * @throws IOException if a page cannot be fetched or the listing has no {@code noticeList} element
     */
    public static void getBatchFromUrl(String listurl) throws InterruptedException, IOException {
        List<String> urls = collectBatchUrls(listurl);
        for (int j = 0, k = urls.size(); j < k; j++) {
            String url = urls.get(j);
            printBatchProgress(url, j, k);
            // Resolve the paginated item links of this batch, then visit each detail page.
            for (Map<String, Object> item : getDetailsPageFromBatchItems(url)) {
                AutobatchDirectory record = getDetailOfAutoBatchInfo((String) item.get("href"));
                if (record == null) {
                    continue; // unrecognised detail layout, nothing to store
                }
                try {
                    saveByJdbc(record);
                } catch (SQLException e) {
                    // Best effort: one bad row must not abort the whole crawl.
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Crawls every announcement batch linked from the listing page and stores
     * each parsed record through the given service.
     *
     * <p>Detail pages whose layout is not recognised are skipped instead of
     * handing {@code null} to the service.
     *
     * @param autobatchDirectoryService service whose {@code save} method receives every parsed record
     * @param listurl URL of the batch listing page
     * @throws InterruptedException if the thread is interrupted during a pause between requests
     * @throws IOException if a page cannot be fetched or the listing has no {@code noticeList} element
     */
    public static void getBatchFromUrl(AutobatchDirectoryService autobatchDirectoryService, String listurl)
            throws InterruptedException, IOException {
        List<String> urls = collectBatchUrls(listurl);
        for (int j = 0, k = urls.size(); j < k; j++) {
            String url = urls.get(j);
            printBatchProgress(url, j, k);
            for (Map<String, Object> item : getDetailsPageFromBatchItems(url)) {
                AutobatchDirectory record = getDetailOfAutoBatchInfo((String) item.get("href"));
                if (record != null) {
                    autobatchDirectoryService.save(record);
                }
            }
        }
    }

    /**
     * Collects the detail links of every item in one batch, across all of the
     * batch's pages.
     *
     * <p>The page count is taken from the largest numeric link inside the
     * elements with class {@code page}; page {@code i} is then requested as
     * {@code url + "_" + i}.
     *
     * @param url URL of the batch page
     * @return one map per item with keys {@code href} (absolute URL),
     *         {@code href_text} and, when present, {@code type_text}
     * @throws InterruptedException if the thread is interrupted during a pause between requests
     * @throws IOException if a page cannot be fetched
     */
    public static List<Map<String, Object>> getDetailsPageFromBatchItems(String url)
            throws InterruptedException, IOException {
        Document firstPage = fetch(url);
        Thread.sleep(1000);

        int totalBatchPage = findLastPageNumber(firstPage);
        List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();

        for (int i = 1; i <= totalBatchPage; i++) {
            System.out.println("分页数据获取进度:" + progressRatio(i, totalBatchPage));
            Thread.sleep(2000); // wait 2 seconds before each page request
            Document page = fetch(url + "_" + i);
            for (Element div : page.getElementsByAttributeValue("class", "noticeLotItem")) {
                Map<String, Object> item = parseLotItem(div);
                if (item != null) {
                    list.add(item);
                }
            }
        }
        System.out.println("Total rows:" + list.size());
        return list;
    }

    /**
     * Downloads one detail page and parses its attribute table.
     *
     * <p>Only tables with 22 rows (no engine parameters) or 23 rows (row 22
     * holds the engine parameters) are understood; the last row is always
     * stored as the remark.
     *
     * @param url URL of the detail page
     * @return the parsed record, or {@code null} when the attribute table is
     *         missing or has an unsupported number of rows
     * @throws IOException if the page cannot be fetched
     * @throws InterruptedException if the thread is interrupted during the pause before the request
     */
    public static AutobatchDirectory getDetailOfAutoBatchInfo(String url) throws IOException, InterruptedException {
        Thread.sleep(2000);
        Document doc = fetch(url);

        Elements tables = doc.getElementsByAttributeValue("class", "noticeAttr mt5");
        if (tables.isEmpty()) {
            System.err.println("No attribute table found, skipping: " + url);
            return null;
        }
        Elements bodies = tables.get(0).getElementsByTag("tbody");
        if (bodies.isEmpty()) {
            System.err.println("Attribute table has no tbody, skipping: " + url);
            return null;
        }
        Elements rows = bodies.get(0).children();
        int rowCount = rows.size();

        boolean hasEngineRow;
        if (rowCount == 22) {
            // 22 rows: the page carries no engine parameters
            System.out.println("......22行表示没有发动机参数.....解析中.......");
            hasEngineRow = false;
        } else if (rowCount == 23) {
            // 23 rows: row 22 carries the engine parameters
            System.out.println("......23行表示有发动机参数.....解析中.......");
            hasEngineRow = true;
        } else {
            System.err.println("Unsupported attribute table (" + rowCount + " rows), skipping: " + url);
            return null;
        }

        AutobatchDirectory autobatchDirectory = new AutobatchDirectory();
        for (int i = 1; i <= rowCount; i++) {
            Element row = rows.get(i - 1);
            if (hasEngineRow && i == 22) {
                fillEngineFields(autobatchDirectory, row);
            } else if (i == rowCount) {
                // Last row: free-text remark in the second cell.
                String remark = row.children().get(1).text();
                autobatchDirectory.setfRemark(remark);
                System.out.println(remark);
            } else {
                setPropertyToObject(autobatchDirectory, i, row);
            }
        }
        return autobatchDirectory;
    }

    /**
     * Copies the two values of one table row (cells 1 and 3) into the fields
     * that belong to row number {@code i}; rows outside 1..21 are only printed.
     */
    private static void setPropertyToObject(AutobatchDirectory autobatchDirectory, int i, Element element) {
        Elements children = element.children();
        String first = children.get(1).text();
        String second = children.get(3).text();
        System.out.println(first + " " + second);
        switch (i) {
        case 1:
            // announcement model / announcement batch
            autobatchDirectory.setfAnnouType(first);
            autobatchDirectory.setfAnnouBatch(second);
            break;
        case 2:
            // brand / vehicle type
            autobatchDirectory.setfVehicleBrand(first);
            autobatchDirectory.setfVehicleType(second);
            break;
        case 3:
            // rated mass (e.g. 32000,32700) / total mass
            autobatchDirectory.setfMaxMass(first);
            autobatchDirectory.setfTotalMass(second);
            break;
        case 4:
            // kerb mass (e.g. 8000,7300) / fuel type
            autobatchDirectory.setfWholeMass(first);
            autobatchDirectory.setfFuelType(second);
            break;
        case 5:
            // emission standard / number of axles
            autobatchDirectory.setfBlowoffStandard(first);
            autobatchDirectory.setfAxleNumber(second);
            break;
        case 6:
            // wheelbase (e.g. 7250+1310+1310) / axle load
            autobatchDirectory.setfWheelbase(first);
            autobatchDirectory.setfAxleWeight(second);
            break;
        case 7:
            // leaf spring count (e.g. -/8/8/8) / number of tyres
            autobatchDirectory.setfSpringNumber(first);
            autobatchDirectory.setfTyreNumber(second);
            break;
        case 8:
            // tyre size (e.g. 11.00R20 12PR) / approach and departure angle
            autobatchDirectory.setfTyreSize(first);
            autobatchDirectory.setfDepartureAngle(second);
            break;
        case 9:
            // front/rear overhang (e.g. -/2080) / front track
            autobatchDirectory.setfFrearSuspension(first);
            autobatchDirectory.setfFrontGauge(second);
            break;
        case 10:
            // rear track (e.g. 1830/1830/1830) / identification code
            autobatchDirectory.setfBackGauge(first);
            autobatchDirectory.setfVinCode(second);
            break;
        case 11:
            // vehicle length (e.g. 13000) / vehicle width
            autobatchDirectory.setfVehicleLength(first);
            autobatchDirectory.setfVehicleWidth(second);
            break;
        case 12:
            // vehicle height (e.g. 2970,3030,2760) / cargo box length
            autobatchDirectory.setfVehicleHeight(first);
            autobatchDirectory.setfCargoLength(second);
            break;
        case 13:
            // cargo box width (e.g. 2400,2450,2470) / cargo box height
            autobatchDirectory.setfCargoWidth(first);
            autobatchDirectory.setfCargoHeight(second);
            break;
        case 14:
            // maximum speed / rated passengers
            autobatchDirectory.setfMaxSpeed(first);
            autobatchDirectory.setfMaxPassenger(second);
            break;
        case 15:
            // cab seating capacity / steering type
            autobatchDirectory.setfCabNumber(first);
            autobatchDirectory.setfSteeringType(second);
            break;
        case 16:
            // permitted trailer total mass / load mass utilisation factor
            autobatchDirectory.setfTotalMassTrailer(first);
            autobatchDirectory.setfLoadMassFactor(second);
            break;
        case 17:
            // semitrailer fifth-wheel maximum load (e.g. 16000,16150) / enterprise name
            autobatchDirectory.setfMaxSemitrailer(first);
            autobatchDirectory.setfEnterpriseName(second);
            break;
        case 18:
            // enterprise address / phone number
            autobatchDirectory.setfEnterpriseAddress(first);
            autobatchDirectory.setfEnterprisePhone(second);
            break;
        case 19:
            // fax number / postcode
            autobatchDirectory.setfEnterpriseFax(first);
            autobatchDirectory.setfPostcode(second);
            break;
        case 20:
            // chassis 1 / chassis 2
            autobatchDirectory.setfChassisOne(first);
            autobatchDirectory.setfChassisTwo(second);
            break;
        case 21:
            // chassis 3 / chassis 4
            autobatchDirectory.setfChassisThree(first);
            autobatchDirectory.setfChassisFour(second);
            break;
        default:
            break;
        }
    }

    /**
     * Inserts one record into {@code autobatch_directory} over JDBC.
     *
     * <p>The engine fields and the remark are written as empty strings when
     * they are {@code null}; every other field is written as-is. A
     * {@code null} record is ignored.
     *
     * <p>NOTE(review): the connection obtained from {@code MycatJdbc} is not
     * closed here, exactly as before; whether it is shared or must be released
     * by the caller cannot be told from this file - confirm against MycatJdbc.
     *
     * @param autobatchDirectory record to insert; {@code null} is a no-op
     * @throws SQLException if no connection is available or the insert fails
     */
    public static void saveByJdbc(AutobatchDirectory autobatchDirectory) throws SQLException {
        if (autobatchDirectory == null) {
            return;
        }
        // Program against the java.sql interfaces; no vendor-specific downcast is needed.
        java.sql.Connection conn = MycatJdbc.getConnection();
        if (conn == null) {
            throw new SQLException("MycatJdbc.getConnection() returned no connection");
        }

        // Values in the exact column order of INSERT_SQL.
        String[] values = {
            autobatchDirectory.getfAnnouType(),
            autobatchDirectory.getfAnnouBatch(),
            autobatchDirectory.getfVehicleBrand(),
            autobatchDirectory.getfVehicleType(),
            autobatchDirectory.getfMaxMass(),
            autobatchDirectory.getfTotalMass(),
            autobatchDirectory.getfWholeMass(),
            autobatchDirectory.getfFuelType(),
            autobatchDirectory.getfBlowoffStandard(),
            autobatchDirectory.getfAxleNumber(),
            autobatchDirectory.getfWheelbase(),
            autobatchDirectory.getfAxleWeight(),
            autobatchDirectory.getfSpringNumber(),
            autobatchDirectory.getfTyreNumber(),
            autobatchDirectory.getfTyreSize(),
            autobatchDirectory.getfDepartureAngle(),
            autobatchDirectory.getfFrearSuspension(),
            autobatchDirectory.getfFrontGauge(),
            autobatchDirectory.getfBackGauge(),
            autobatchDirectory.getfVinCode(),
            autobatchDirectory.getfVehicleLength(),
            autobatchDirectory.getfVehicleWidth(),
            autobatchDirectory.getfVehicleHeight(),
            autobatchDirectory.getfCargoLength(),
            autobatchDirectory.getfCargoWidth(),
            autobatchDirectory.getfCargoHeight(),
            autobatchDirectory.getfMaxSpeed(),
            autobatchDirectory.getfMaxPassenger(),
            autobatchDirectory.getfCabNumber(),
            autobatchDirectory.getfSteeringType(),
            autobatchDirectory.getfTotalMassTrailer(),
            autobatchDirectory.getfLoadMassFactor(),
            autobatchDirectory.getfMaxSemitrailer(),
            autobatchDirectory.getfEnterpriseName(),
            autobatchDirectory.getfEnterpriseAddress(),
            autobatchDirectory.getfEnterprisePhone(),
            autobatchDirectory.getfEnterpriseFax(),
            autobatchDirectory.getfPostcode(),
            autobatchDirectory.getfChassisOne(),
            autobatchDirectory.getfChassisTwo(),
            autobatchDirectory.getfChassisThree(),
            autobatchDirectory.getfChassisFour(),
            emptyIfNull(autobatchDirectory.getfEngineType()),
            emptyIfNull(autobatchDirectory.getfEnginePro()),
            emptyIfNull(autobatchDirectory.getfEngineTrademark()),
            emptyIfNull(autobatchDirectory.getfOutputVolume()),
            emptyIfNull(autobatchDirectory.getfPower()),
            emptyIfNull(autobatchDirectory.getfRemark())
        };

        java.sql.PreparedStatement pstm = null;
        try {
            System.out.println(INSERT_SQL);
            pstm = conn.prepareStatement(INSERT_SQL);
            for (int idx = 0; idx < values.length; idx++) {
                pstm.setString(idx + 1, values[idx]); // JDBC parameters are 1-based
            }
            pstm.executeUpdate();
        } finally {
            // A failure is no longer swallowed here: it reaches the caller as the declared SQLException.
            if (pstm != null) {
                try {
                    pstm.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Fetches and parses one page with the shared user agent and timeout. */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url).userAgent(USER_AGENT).timeout(timeOut).get();
    }

    /** Returns {@code href} unchanged when it is already absolute (http or https), otherwise prefixes the site root. */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        return web + href;
    }

    /** Same coarse progress figure the crawl has always printed: integer percent, scaled back to a 0..1 fraction. */
    private static double progressRatio(int done, int total) {
        return (double) Math.round(done * 100 / total) / 100;
    }

    /** Returns the argument, or the empty string when it is {@code null}. */
    private static String emptyIfNull(String value) {
        return value == null ? "" : value;
    }

    /**
     * Reads the batch links from the listing page, prints each one, and
     * returns them in reversed document order. Children without an
     * {@code href} are skipped.
     */
    private static List<String> collectBatchUrls(String listurl) throws IOException {
        Document doc = fetch(listurl);
        Element batchElement = doc.getElementById("noticeList");
        if (batchElement == null) {
            throw new IOException("noticeList element not found at " + listurl);
        }
        List<String> urls = new ArrayList<String>();
        for (Element element : batchElement.children()) {
            String href = element.attr("href");
            if ("".equals(href)) {
                continue; // not a link; it would otherwise become the bare site URL
            }
            String batchUrl = toAbsoluteUrl(href);
            System.out.println(element.text() + "\t\t" + batchUrl);
            urls.add(batchUrl);
        }
        // Reverse the listing order before crawling.
        Collections.reverse(urls);
        System.out.println("总批次数:" + urls.size());
        return urls;
    }

    /** Prints the progress line for one batch; the label is the URL part after the first underscore, or the whole URL when there is none. */
    private static void printBatchProgress(String url, int index, int total) {
        String[] parts = url.split("_");
        String label = parts.length > 1 ? parts[1] : url;
        System.out.println("第" + label + "批数据获取进度:" + progressRatio(index, total));
    }

    /** Returns the largest numeric pager link on the page, or 1 when there is none. */
    private static int findLastPageNumber(Document doc) {
        int max = 1;
        for (Element pager : doc.getElementsByAttributeValue("class", "page")) {
            for (Element link : pager.getElementsByTag("a")) {
                String text = link.text();
                if (text.equals("") || text.equals("下一页") || text.equals("上一页")) {
                    continue;
                }
                try {
                    int index = Integer.parseInt(text);
                    if (index > max) {
                        max = index;
                    }
                } catch (NumberFormatException ignored) {
                    // Any other non-numeric pager label carries no page number.
                }
            }
        }
        return max;
    }

    /**
     * Extracts link and type text from one {@code noticeLotItem} element.
     *
     * @return the item map, or {@code null} when the element has no usable link
     */
    private static Map<String, Object> parseLotItem(Element div) {
        Map<String, Object> item = null;
        for (Element child : div.children()) {
            String cssClass = child.attr("class");
            if (cssClass.equals("m")) {
                Elements links = child.getElementsByAttribute("href");
                if (links.isEmpty()) {
                    continue;
                }
                Element anchor = links.get(0);
                String href = anchor.attr("href");
                if ("".equals(href)) {
                    continue;
                }
                item = new HashMap<String, Object>();
                item.put("href", toAbsoluteUrl(href));
                item.put("href_text", anchor.text());
            } else if (cssClass.equals("c")) {
                if (item == null) {
                    continue; // type text is only kept once a link has been found
                }
                item.put("type_text", child.text());
            }
        }
        return item;
    }

    /**
     * Reads the engine parameters from the nested table of the engine row:
     * the cells of the nested table's second row are, in order, engine model,
     * engine manufacturer, engine trademark, displacement and power.
     */
    private static void fillEngineFields(AutobatchDirectory autobatchDirectory, Element row) {
        Element engineTable = row.getElementsByTag("table").get(0);
        Elements cells = engineTable.getElementsByTag("tbody").get(0).children().get(1).children();
        int count = 0;
        for (Element cell : cells) {
            switch (count) {
            case 0:
                autobatchDirectory.setfEngineType(cell.text());
                break;
            case 1:
                autobatchDirectory.setfEnginePro(cell.text());
                break;
            case 2:
                autobatchDirectory.setfEngineTrademark(cell.text());
                break;
            case 3:
                autobatchDirectory.setfOutputVolume(cell.text());
                break;
            case 4:
                autobatchDirectory.setfPower(cell.text());
                break;
            default:
                break;
            }
            count++;
        }
    }
}

代码没什么难度,都是基本的元素解析。

Java Jsoup Spider抓取数据入库的更多相关文章

  1. java抓取网页数据,登录之后抓取数据。

    最近做了一个从网络上抓取数据的一个小程序.主要关于信贷方面,收集的一些黑名单网站,从该网站上抓取到自己系统中. 也找了一些资料,觉得没有一个很好的,全面的例子.因此在这里做个笔记提醒自己. 首先需要一 ...

  2. Java模拟新浪微博登陆抓取数据

    前言:  兄弟们来了来了,最近有人在问如何模拟新浪微博登陆抓取数据,我听后默默地抽了一口老烟,暗暗的对自己说,老汉是时候该你出场了,所以今天有时间就整理整理,浅谈一二. 首先:  要想登陆新浪微博需要 ...

  3. 通过Java进行网页抓取并生成plist创建代码

    抓取网页的方法: 抓取网页可以通过正则表达式也可以通过Java. 通过firefox浏览器,安装Firebug来查看网页的源代码. 首先将要抓取的部分保存到本地,步骤如下: 1.在要抓取的位置右键,选 ...

  4. .net处理页面的抓取数据

    //要抓取数据的页面路径 string url = "http://www.scedu.net/banshi/used-car/lower-secondary-education/middl ...

  5. windows环境下nutch2.x 在eclipse中实现抓取数据存进mysql详细步骤

    nutch2.x 在eclipse中实现抓取数据存进mysql步骤 最近在研究nutch,花了几天时间,也遇到很多问题,最终结果还是成功了,在此记录,并给其他有兴趣的人提供参考,共同进步. 对nutc ...

  6. java做web抓取

    就像许多现代科技一样,从网站提取信息这一功能也有多个框架可以选择.最流行的有JSoup.HTMLUnit和Selenium WebDriver.我们这篇文章讨论JSoup.JSoup是个开源项目,提供 ...

  7. 分布式爬虫:使用Scrapy抓取数据

    分布式爬虫:使用Scrapy抓取数据 Scrapy是Python开发的一个快速,高层次的屏幕抓取和web抓取框架,用于抓取web站点并从页面中提取结构化的数据.Scrapy用途广泛,可以用于数据挖掘. ...

  8. C# 从需要登录的网站上抓取数据

    [转] C# 从需要登录的网站上抓取数据 背景:昨天一个学金融的同学让我帮她从一个网站上抓取数据,然后导出到excel,粗略看了下有1000+条记录,人工统计的话确实不可能.虽说不会,但作为一个学计算 ...

  9. nodejs--实现跨域抓取数据

    最近公司安排给我一个任务,抓取页面数据:http://survey.finance.sina.com.cn/static/20205/20131120.html?pid=20205&dpc=1 ...

随机推荐

  1. Codeforces Round #321 (Div. 2) C dfs处理(双向边叶子节点的判断)

    C. Kefa and Park time limit per test 2 seconds memory limit per test 256 megabytes input standard in ...

  2. $.extend(object) 和 $.fn.extend(object)

    1.jQuery.extend(object); 它是为jQuery类添加类方法,可以理解为添加静态方法.如: jQuery.extend({ min: function(a, b) { return ...

  3. 《Linux命令行与shell脚本编程大全 第3版》Linux命令行---25

    以下为阅读<Linux命令行与shell脚本编程大全 第3版>的读书笔记,为了方便记录,特地与书的内容保持同步,特意做成一节一次随笔,特记录如下:

  4. formal parameter

    formal parameter : [3.16] object declared as part of a function declaration or definition that acqui ...

  5. 案子前申請 EVB board (Evaluation Board)

    在跑案子前, 需向各 component vendor 申請 EVB board, 其中也包含 mosfet , 以利做實驗, spec 有可能會寫錯 或不清楚, 所以需要使用 EVB board 檢 ...

  6. vmware tools安装过程

    每次通过vmware安装Ubuntu的时候,总是会多多少少出点问题.好容易披荆斩棘把镜像安好了,然而屏幕却只有小小一个,不能显示大屏,我就知道肯定是缺少了vmware tools.于是点击左上方菜单中 ...

  7. JavaScript-性能优化,函数节流(throttle)与函数去抖(debounce)

    我在写一个类似百度搜索框的自动提示功能时候,使用了AJAX+keydown事件.调试时候我发现,当在搜索框中输入文字的时候,控制台在不停发送AJAX.这在本地服务器测试还好,如果我把它拿到运行环境,很 ...

  8. AC日记——[ZJOI2009]狼和羊的故事 bzoj 1412

    1412 思路: 最小割: 狼作为一个点集a,空领地作为点集b,羊作为点集c: s向a连边,c向t连边,a向b连边,b向b连边,b向c连边: 如何理解最小割? a,c之间割掉最少的路径(栅栏)使其没有 ...

  9. [Machine Learning with Python] Data Preparation through Transformation Pipeline

    In the former article "Data Preparation by Pandas and Scikit-Learn", we discussed about a ...

  10. NIO2.0之copy、delete和move

    转自:http://www.importnew.com/15884.html Java 7引入了NIO.2,NIO.2是继承自NIO框架,并增加了新的功能(例如:处理软链接和硬链接的功能).这篇帖子包 ...