一、代码演示

如果抓取过程中途中断,可以通过筛选条件跳过已拉取的省份数据,然后继续执行。

/**
 * Crawls the administrative-division code tables served under
 * {@code http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/} and stores every level
 * (province, city, district/county, street/town, community/village) through
 * {@link ProvinceService}.
 *
 * <p>The crawl keeps no state in instance fields: every level derives the base URL for
 * its child links from the URL of the page it is currently reading.
 *
 * @author kevin
 * @createTime 2019-11-18 19:37
 */
@RestController
public class CityController {

    /** Maximum number of download attempts for one page before that page is given up. */
    private static final int MAX_FETCH_ATTEMPTS = 5;

    /**
     * Provinces that are skipped because an earlier, interrupted run already stored them.
     * Edit this list when resuming a crawl.
     */
    private static final String[] ALREADY_CRAWLED_PROVINCES = {"北京市", "天津市", "河北省"};

    /** Country id written into every province record. */
    private static final long COUNTRY_ID = 1196612453660643329L;

    /** Audit values written into every record created by the crawler. */
    private static final long CREATE_USER_ID = 1L;
    private static final String CREATE_USERNAME = "admin";

    @Autowired
    private ProvinceService provinceService;
    @Autowired
    private HttpUtil httpUtil;

    /**
     * Starts a full crawl: loads the index page, follows the first year link and walks
     * every province found on that page down to community level.
     *
     * @return {@code "fail"} when the index page or the year page cannot be loaded or has
     *         no year link, otherwise {@code "spider crawl end."}
     * @throws Exception if the first request fails with an I/O error or a record cannot
     *                   be stored
     */
    @GetMapping("/start")
    public ResultTemplate<String> spider() throws Exception {
        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/";
        String charset = "gb2312";

        Document rootDoc = httpUtil.get(url, charset);
        if (rootDoc == null) {
            return of("fail");
        }
        Elements yearLists = rootDoc.getElementsByClass("center_list_contlist");
        if (yearLists.isEmpty()) {
            return of("fail");
        }
        Elements yearLinks = yearLists.get(0).select("a");
        if (yearLinks.isEmpty()) {
            return of("fail");
        }
        // Link of the most recent year, e.g.
        // http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
        String yearHref = yearLinks.get(0).attr("href");
        Document yearDoc = fetchWithRetry(yearHref, charset);
        if (yearDoc == null) {
            return of("fail");
        }

        String yearBaseUrl = parentUrl(yearHref);
        // Walk every province link of every province row.
        for (Element provinceRow : yearDoc.getElementsByClass("provincetr")) {
            for (Element provinceLink : provinceRow.select("a")) {
                String name = provinceLink.text();
                // e.g. 11.html
                String href = provinceLink.attr("href");
                if (isAlreadyCrawled(name)) {
                    System.out.println("未执行市:" + name);
                    continue;
                }
                System.out.println("开始时间:" + LocalDateTime.now());
                System.out.println("省名称:" + name);
                DicProvince province = new DicProvince()
                        .setProvinceName(name)
                        .setProvinceCode(codeOf(href))
                        .setCountryId(COUNTRY_ID)
                        .setCreateDate(LocalDateTime.now())
                        .setCreateUserid(CREATE_USER_ID)
                        .setCreateUsername(CREATE_USERNAME);
                Long provinceId = provinceService.insertProvince(province);
                // e.g. http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/11.html
                getCites(yearBaseUrl + href, charset, provinceId);
            }
        }
        return of("spider crawl end.");
    }

    /**
     * Stores every city listed on a province page and descends into its districts.
     *
     * @param url        absolute URL of the province page
     * @param charset    charset used to decode the page
     * @param provinceId id of the already stored parent province
     */
    private void getCites(String url, String charset, Long provinceId) throws Exception {
        Document page = fetchWithRetry(url, charset);
        if (page == null) {
            return;
        }
        String baseUrl = parentUrl(url);
        for (Element row : page.getElementsByClass("citytr")) {
            Elements links = row.select("a");
            if (links.size() < 2) {
                // Row without links: no child page, take code and name from the cells.
                Elements cells = row.select("td");
                if (cells.size() >= 2) {
                    provinceService.insertCity(new DicCity()
                            .setCityName(cells.get(1).text())
                            .setCityCode(cells.get(0).text())
                            .setProvinceId(provinceId)
                            .setCreateDate(LocalDateTime.now())
                            .setCreateUserid(CREATE_USER_ID)
                            .setCreateUsername(CREATE_USERNAME));
                }
                continue;
            }
            Element nameLink = links.get(1); // the second link carries the city name
            // e.g. 11/1101.html
            String href = nameLink.attr("href");
            DicCity city = new DicCity()
                    .setCityName(nameLink.text())
                    .setCityCode(codeOf(href))
                    .setProvinceId(provinceId)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(CREATE_USER_ID)
                    .setCreateUsername(CREATE_USERNAME);
            Long cityId = provinceService.insertCity(city);
            getDistrict(baseUrl + href, charset, cityId);
        }
    }

    /**
     * Stores every district/county listed on a city page and descends into its streets.
     * Rows without links (printed as "市辖区") have no child page and are stored from
     * their table cells only.
     *
     * @param url     absolute URL of the city page
     * @param charset charset used to decode the page
     * @param idDis   id of the already stored parent city
     */
    private void getDistrict(String url, String charset, Long idDis) throws Exception {
        Document page = fetchWithRetry(url, charset);
        if (page == null) {
            return;
        }
        String baseUrl = parentUrl(url);
        for (Element row : page.getElementsByClass("countytr")) {
            Elements links = row.select("a");
            if (links.size() < 2) {
                System.out.println("市辖区");
                Elements cells = row.select("td");
                if (cells.size() >= 2) {
                    provinceService.insertDistrict(new DicDistrict()
                            .setDistrictName(cells.get(1).text())
                            .setDistrictCode(cells.get(0).text())
                            .setCityId(idDis)
                            .setCreateDate(LocalDateTime.now())
                            .setCreateUserid(CREATE_USER_ID)
                            .setCreateUsername(CREATE_USERNAME));
                }
                System.out.println("执行完毕");
                continue;
            }
            Element nameLink = links.get(1);
            String href = nameLink.attr("href");
            DicDistrict district = new DicDistrict()
                    .setDistrictName(nameLink.text())
                    .setDistrictCode(codeOf(href))
                    .setCityId(idDis)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(CREATE_USER_ID)
                    .setCreateUsername(CREATE_USERNAME);
            Long districtId = provinceService.insertDistrict(district);
            getStreet(baseUrl + href, charset, districtId);
        }
    }

    /**
     * Stores every street/town listed on a district page and descends into its communities.
     *
     * @param url     absolute URL of the district page
     * @param charset charset used to decode the page
     * @param idStr   id of the already stored parent district
     */
    private void getStreet(String url, String charset, Long idStr) throws Exception {
        Document page = fetchWithRetry(url, charset);
        if (page == null) {
            return;
        }
        String baseUrl = parentUrl(url);
        for (Element row : page.getElementsByClass("towntr")) {
            Elements links = row.select("a");
            if (links.size() < 2) {
                // Row without links: no child page, take code and name from the cells.
                Elements cells = row.select("td");
                if (cells.size() >= 2) {
                    provinceService.insertStreet(new DicStreet()
                            .setStreetName(cells.get(1).text())
                            .setStreetCode(cells.get(0).text())
                            .setDistrictId(idStr)
                            .setCreateDate(LocalDateTime.now())
                            .setCreateUserid(CREATE_USER_ID)
                            .setCreateUsername(CREATE_USERNAME));
                }
                continue;
            }
            Element nameLink = links.get(1); // the second link carries the street name
            String href = nameLink.attr("href");
            DicStreet street = new DicStreet()
                    .setStreetName(nameLink.text())
                    .setStreetCode(codeOf(href))
                    .setDistrictId(idStr)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(CREATE_USER_ID)
                    .setCreateUsername(CREATE_USERNAME);
            Long streetId = provinceService.insertStreet(street);
            getCommunity(baseUrl + href, charset, streetId);
        }
    }

    /**
     * Stores every community/village listed on a street page. This is the last level;
     * each row is read from three cells: code, classification code and name.
     *
     * @param url     absolute URL of the street page
     * @param charset charset used to decode the page
     * @param idPro   id of the already stored parent street
     */
    private void getCommunity(String url, String charset, Long idPro) throws Exception {
        Document page = fetchWithRetry(url, charset);
        if (page == null) {
            return;
        }
        for (Element row : page.getElementsByClass("villagetr")) {
            Elements cells = row.select("td");
            if (cells.size() < 3) {
                continue; // malformed row, nothing reliable to store
            }
            DicCommunity community = new DicCommunity()
                    .setCommunityName(cells.get(2).text())
                    .setCommunityCode(cells.get(0).text())
                    .setClassificationCode(cells.get(1).text())
                    .setStreetId(idPro)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(CREATE_USER_ID)
                    .setCreateUsername(CREATE_USERNAME);
            provinceService.insertCommunity(community);
        }
    }

    /**
     * Downloads a page, retrying up to {@link #MAX_FETCH_ATTEMPTS} times when the request
     * throws or yields no document.
     *
     * @return the parsed page, or {@code null} when every attempt failed
     */
    private Document fetchWithRetry(String url, String charset) {
        for (int attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
            if (attempt >= 3) {
                System.out.println("循环次数:" + attempt);
            }
            try {
                Document page = httpUtil.get(url, charset);
                if (page != null) {
                    return page;
                }
            } catch (Exception e) {
                System.out.println("请求网页链接报错");
            }
        }
        System.out.println("giving up after " + MAX_FETCH_ATTEMPTS + " attempts: " + url);
        return null;
    }

    /** Returns whether the named province is listed in {@link #ALREADY_CRAWLED_PROVINCES}. */
    private static boolean isAlreadyCrawled(String provinceName) {
        for (String crawled : ALREADY_CRAWLED_PROVINCES) {
            if (crawled.equals(provinceName)) {
                return true;
            }
        }
        return false;
    }

    /** Returns the URL up to and including its last {@code '/'}; child hrefs are appended to it. */
    private static String parentUrl(String url) {
        return url.substring(0, url.lastIndexOf('/') + 1);
    }

    /**
     * Extracts the division code from a link target: the file name without its extension,
     * e.g. {@code "11/1101.html"} yields {@code "1101"}.
     */
    private static String codeOf(String href) {
        int start = href.lastIndexOf('/') + 1;
        int end = href.indexOf('.', start);
        return end < 0 ? href.substring(start) : href.substring(start, end);
    }
}

二、HttpUtil工具类

/**
 * Small HTTP helper that downloads a page with {@link HttpURLConnection} and parses it
 * into a jsoup {@link Document}.
 *
 * @author kevin
 * @createTime 2019-11-20 9:17
 */
@Component
public class HttpUtil {

    /** Browser-like user agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36";

    /** Connect and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 80000;

    /**
     * Performs a GET request and parses the response body.
     *
     * @param url     absolute URL to download; also used as the base URI of the parsed document
     * @param charset charset name used to decode the response body
     * @return the parsed document, or {@code null} when the response could not be read or parsed
     * @throws IOException if the URL is malformed or the connection cannot be opened
     */
    public Document get(String url, String charset) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            // Caching is allowed by default; disable it.
            connection.setUseCaches(false);
            // Request headers.
            connection.addRequestProperty("Connection", "close");
            connection.addRequestProperty("user-agent", USER_AGENT);
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            // Close the stream on every path instead of relying on the parser to do it.
            try (java.io.InputStream body = connection.getInputStream()) {
                return Jsoup.parse(body, charset, url);
            } catch (Exception e) {
                // Callers treat null as "retry"; keep that contract but report the cause.
                System.out.println("parse error: " + url + " (" + e + ")");
                return null;
            }
        } finally {
            connection.disconnect();
        }
    }
}

三、service部分,根据需要自行定义数据库表

/**
 * {@link ProvinceService} implementation that stores each administrative level through
 * its mapper and returns the id found on the entity after the insert.
 *
 * <p>Every insert is attempted at most {@link #MAX_INSERT_ATTEMPTS} times. A persistent
 * failure is reported with an {@link IllegalStateException} carrying the last cause,
 * instead of being retried forever.
 *
 * @author kevin
 * @createTime 2019-11-18 20:41
 */
@Service
public class ProvinceServiceImpl implements ProvinceService {

    /** Maximum number of attempts for one insert before giving up. */
    private static final int MAX_INSERT_ATTEMPTS = 3;

    @Autowired
    private ProvinceMapper provinceMapper;
    @Autowired
    private CityMapper cityMapper;
    @Autowired
    private DistrictMapper districtMapper;
    @Autowired
    private StreetMapper streetMapper;
    @Autowired
    private CommunityMapper communityMapper;

    /**
     * Inserts a province.
     *
     * @return the province id read from the entity after the insert
     * @throws IllegalStateException if the insert keeps failing
     */
    @Override
    public Long insertProvince(DicProvince dicProvince) {
        insertWithRetry(() -> provinceMapper.insert(dicProvince), "插入省数据失败");
        return dicProvince.getProvinceId();
    }

    /**
     * Inserts a city.
     *
     * @return the city id read from the entity after the insert
     * @throws IllegalStateException if the insert keeps failing
     */
    @Override
    public Long insertCity(DicCity dicCity) {
        insertWithRetry(() -> cityMapper.insert(dicCity), "插入市数据失败");
        return dicCity.getCityId();
    }

    /**
     * Inserts a district/county.
     *
     * @return the district id read from the entity after the insert
     * @throws IllegalStateException if the insert keeps failing
     */
    @Override
    public Long insertDistrict(DicDistrict dicDistrict) {
        insertWithRetry(() -> districtMapper.insert(dicDistrict), "插入区县数据失败");
        return dicDistrict.getDistrictId();
    }

    /**
     * Inserts a street/town.
     *
     * @return the street id read from the entity after the insert
     * @throws IllegalStateException if the insert keeps failing
     */
    @Override
    public Long insertStreet(DicStreet dicStreet) {
        insertWithRetry(() -> streetMapper.insert(dicStreet), "插入街道数据失败");
        return dicStreet.getStreetId();
    }

    /**
     * Inserts a community/village.
     *
     * @return the community id read from the entity after the insert
     * @throws IllegalStateException if the insert keeps failing
     */
    @Override
    public Long insertCommunity(DicCommunity dicCommunity) {
        insertWithRetry(() -> communityMapper.insert(dicCommunity), "插入社区数据失败");
        return dicCommunity.getCommunityId();
    }

    /**
     * Runs an insert until it reports exactly one affected row, at most
     * {@link #MAX_INSERT_ATTEMPTS} times.
     *
     * @param insert         the insert call; returns the number of affected rows
     * @param failureMessage message printed on every failed attempt and used in the final exception
     * @throws IllegalStateException when no attempt affected exactly one row
     */
    private static void insertWithRetry(java.util.function.IntSupplier insert, String failureMessage) {
        RuntimeException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_INSERT_ATTEMPTS; attempt++) {
            try {
                if (insert.getAsInt() == 1) {
                    return;
                }
            } catch (RuntimeException e) {
                lastFailure = e;
                System.out.println(failureMessage);
                e.printStackTrace();
            }
        }
        throw new IllegalStateException(
                failureMessage + " (" + MAX_INSERT_ATTEMPTS + " attempts)", lastFailure);
    }
}

  

基于【 springBoot+jsoup】一 || 爬取全国行政区划数据的更多相关文章

  1. Java使用Jsoup之爬取博客数据应用实例

    导入Maven依赖 <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup --> <dependency> <g ...

  2. python爬虫学习之爬取全国各省市县级城市邮政编码

    实例需求:运用python语言在http://www.ip138.com/post/网站爬取全国各个省市县级城市的邮政编码,并且保存在excel文件中 实例环境:python3.7 requests库 ...

  3. Python爬取招聘网站数据,给学习、求职一点参考

    1.项目背景 随着科技的飞速发展,数据呈现爆发式的增长,任何人都摆脱不了与数据打交道,社会对于“数据”方面的人才需求也在不断增大.因此了解当下企业究竟需要招聘什么样的人才?需要什么样的技能?不管是对于 ...

  4. Java实现爬取京东手机数据

    Java实现爬取京东手机数据 最近看了某马的Java爬虫视频,看完后自己上手操作了下,基本达到了爬数据的要求,HTML页面源码也刚好复习了下,之前发布两篇关于简单爬虫的文章,也刚好用得上.项目没什么太 ...

  5. Java爬取同花顺股票数据(附源码)

    最近有小伙伴问我能不能抓取同花顺的数据,最近股票行情还不错,想把数据抓下来自己分析分析.我大A股,大家都知道的,一个概念火了,相应的股票就都大涨. 如果能及时获取股票涨跌信息,那就能在刚开始火起来的时 ...

  6. Scrapy 通过登录的方式爬取豆瓣影评数据

    Scrapy 通过登录的方式爬取豆瓣影评数据 爬虫 Scrapy 豆瓣 Fly 由于需要爬取影评数据在来做分析,就选择了豆瓣影评来抓取数据,工具使用的是Scrapy工具来实现.scrapy工具使用起来 ...

  7. selenium跳过webdriver检测并爬取天猫商品数据

    目录 简介 编写思路 使用教程 演示图片 源代码 @(文章目录) 简介 现在爬取淘宝,天猫商品数据都是需要首先进行登录的.上一节我们已经完成了模拟登录淘宝的步骤,所以在此不详细讲如何模拟登录淘宝.把关 ...

  8. 【scrapy_redis】调试后爬取了部分数据,然后重新调试时,直接被去重机制过滤掉无法重头开始爬取

    这2天遇到一个问题,之前调试的时候爬取了一些数据,结果第二天重新调试的时候发现爬虫很快结束,而且还没有报错.后来从日志里看到这个: no more duplicates will be shown ( ...

  9. node 爬虫 --- 将爬取到的数据,保存到 mysql 数据库中

    步骤一:安装必要模块 (1)cheerio模块 ,一个类似jQuery的选择器模块,分析HTML利器. (2)request模块,让http请求变的更加简单 (3)mysql模块,node连接mysq ...

随机推荐

  1. [转]EXCEL截取字符串中某几位的函数——LeftMIDRight及Find函数的使用

    原文地址:http://blog.sina.com.cn/s/blog_3f136a180102ymq5.html EXCEL截取字符串中某几位的函数 ——Left MID Right及Find函数的 ...

  2. PHP curl put方式上传文件

    发送端: <?php function curlPut($destUrl, $sourceFileDir, $headerArr = array(), $timeout = ) { $ch = ...

  3. Java基础 do-while 简单示例

        JDK :OpenJDK-11      OS :CentOS 7.6.1810      IDE :Eclipse 2019‑03 typesetting :Markdown   code ...

  4. jmeter中特殊的时间处理方式

    需求: 1.获取当前时间的年月日时分秒毫秒 2.生成上一个月的随机某天的一个时间 3.生成一个年月日时分秒毫秒的一个时间戳 1.__time : 获取时间戳.格式化时间 ${__time(yyyy-M ...

  5. Eclipse安装中文简体语言包

    原文地址:https://blog.csdn.net/qq_41101213/article/details/84405452 方法一:1. 下载中文语言包: 1.1 打开网址:http://www. ...

  6. linux配置docker报错:ImportError: No module named yum

    如题,安装docker后配置仓库报错: [root@centos ~]# yum-config-manager --add-repo https://download.docker.com/linux ...

  7. ByteBuf使用实例

    之前我们有个netty5的拆包解决方案(参加netty5拆包问题解决实例),现在我们采用另一种思路,不需要新增LengthFieldBasedFrameDecoder,直接修改NettyMessage ...

  8. ES6深入浅出-5 新版对象-1.如何创建对象

    对象属性的加强: 可以通过new Object(), Object.create()方法,或者使用字面量标记(初始化标记)初始化对象. 一个对象初始化器,由花括号/大括号 ({}) 包含的一个由零个或 ...

  9. 【Tomcat】Tomcat 基本使用(二)

    上一章介绍了Tomcat原理[Tomcat]Tomcat 原理架构(一),本章介绍Tomcat的基本使用 Tomcat端口设置 tomcat端口设置,在tomcat的配置文件目录下的server.xm ...

  10. 123457123456#0#----com.ppGame.HappyShuXue54--前拼后广--儿童数学_pp

    com.ppGame.HappyShuXue54--前拼后广--儿童数学_pp