[Java] 数据分析--数据预处理
数据结构
- 键-值对:HashMap

1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.HashMap;
4 import java.util.Scanner;
5
public class HashMapExample {
    /**
     * Reads whitespace-separated (country, population) pairs from
     * {@code data/Countries.dat} into a HashMap, then prints the number of
     * entries and the population stored under the key "Peru".
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File dataFile = new File("data/Countries.dat");
        HashMap<String,Integer> dataset = new HashMap<>();
        // try-with-resources releases the file handle on every exit path,
        // including a malformed record that makes nextInt() throw.
        try (Scanner input = new Scanner(dataFile)) {
            while (input.hasNext()) {
                String country = input.next();
                int population = input.nextInt();
                dataset.put(country, population);
            }
        } catch (FileNotFoundException e) {
            System.out.println(e);
        }
        System.out.printf("dataset.size(): %d%n", dataset.size());
        // get() returns null when "Peru" is absent; printf then prints "null".
        System.out.printf("dataset.get(\"Peru\"): %,d%n", dataset.get("Peru"));
    }
}
文件处理
- csv文件
- 将Map数据存入Excel(.xls)文件

1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.io.FileOutputStream;
4 import java.io.IOException;
5 import java.util.Map;
6 import java.util.Scanner;
7 import java.util.Set;
8 import java.util.TreeMap;
9 import org.apache.poi.hssf.usermodel.HSSFRow;
10 import org.apache.poi.hssf.usermodel.HSSFSheet;
11 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
12
13 public class FromMapToExcel {
14 public static void main(String[] args) {
15 Map<String,Integer> map = new TreeMap();
16 load(map, "data/Countries.dat");
17 print(map);
18 storeXL(map, "data/Countries.xls", "Countries Worksheet");
19 }
20
21 /** Loads the data from the specified file into the specified map.
22 */
23 public static void load(Map map, String fileSpec) {
24 File file = new File(fileSpec);
25 try {
26 Scanner input = new Scanner(file);
27 while (input.hasNext()) {
28 String country = input.next();
29 int population = input.nextInt();
30 map.put(country, population);
31 }
32 } catch (FileNotFoundException e) {
33 System.out.println(e);
34 }
35 }
36
37 public static void print(Map map) {
38 Set countries = map.keySet();
39 for (Object country : countries) {
40 Object population = map.get(country);
41 System.out.printf("%-10s%,12d%n", country, population);
42 }
43 }
44
45 /** Stores the specified map in the specified worksheet of
46 the specified Excel workbook file.
47 * @param map
48 * @param fileSpec
49 * @param sheet
50 */
51 public static void storeXL(Map map, String fileSpec, String sheet) {
52 try {
53 FileOutputStream out = new FileOutputStream(fileSpec);
54 HSSFWorkbook workbook = new HSSFWorkbook();
55 HSSFSheet worksheet = workbook.createSheet(sheet);
56 Set countries = map.keySet();
57 short rowNum = 0;
58 for (Object country : countries) {
59 Object population = map.get(country);
60 HSSFRow row = worksheet.createRow(rowNum);
61 row.createCell(0).setCellValue((String)country);
62 row.createCell(1).setCellValue((Integer)population);
63 ++rowNum;
64 }
65 workbook.write(out);
66 out.flush();
67 out.close();
68 } catch (FileNotFoundException e) {
69 System.err.println(e);
70 } catch (IOException e) {
71 System.err.println(e);
72 }
73 }
74 }
- 读取csv文件

1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.HashMap;
4 import java.util.Scanner;
5
public class ReadingCSVFiles {
    /**
     * Reads {@code data/Countries.csv}, whose first record holds two column
     * headers and whose remaining records are (country, population) pairs,
     * and prints the contents as an aligned two-column table.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File dataFile = new File("data/Countries.csv");
        // try-with-resources closes the file even when a record is malformed.
        try (Scanner input = new Scanner(dataFile)) {
            // Treat any run of commas/whitespace as ONE separator. The former
            // single-character pattern ",|\\s" produced empty tokens for
            // "\r\n" line endings and for ", " separators, which made
            // nextInt() throw InputMismatchException.
            input.useDelimiter("[,\\s]+");
            String column1 = input.next();
            String column2 = input.next();
            System.out.printf("%-10s%12s%n", column1, column2);
            while (input.hasNext()) {
                String country = input.next();
                int population = input.nextInt();
                System.out.printf("%-10s%,12d%n", country, population);
            }
        } catch (FileNotFoundException e) {
            System.out.println(e);
        }
    }
}
- 读取Excel(.xls)文件到Map

1 import static dawj.ch02.FromMapToExcel.print;
2 import java.io.FileInputStream;
3 import java.io.FileNotFoundException;
4 import java.io.IOException;
5 import java.util.Map;
6 import java.util.TreeMap;
7 import org.apache.poi.hssf.usermodel.HSSFCell;
8 import org.apache.poi.hssf.usermodel.HSSFRow;
9 import org.apache.poi.hssf.usermodel.HSSFSheet;
10 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
11 import org.apache.poi.ss.usermodel.DataFormatter;
12 import org.apache.poi.ss.usermodel.Row;
13
14 public class FromExcelToMap {
15 public static void main(String[] args) {
16 Map map = loadXL("data/Countries.xls", "Countries Worksheet");
17 print(map);
18 }
19
20 /** Returns a Map object containing the data from the specified
21 worksheet in the specified Excel file.
22 */
23 public static Map loadXL(String fileSpec, String sheetName) {
24 Map<String,Integer> map = new TreeMap();
25 try {
26 FileInputStream stream = new FileInputStream(fileSpec);
27 HSSFWorkbook workbook = new HSSFWorkbook(stream);
28 HSSFSheet worksheet = workbook.getSheet(sheetName);
29 DataFormatter formatter = new DataFormatter();
30 for (Row row : worksheet) {
31 HSSFRow hssfRow = (HSSFRow)row;
32 HSSFCell cell = hssfRow.getCell(0);
33 String country = cell.getStringCellValue();
34 cell = hssfRow.getCell(1);
35 String str = formatter.formatCellValue(cell);
36 int population = (int)Integer.getInteger(str);
37 map.put(country, population);
38 }
39 } catch (FileNotFoundException e) {
40 System.err.println(e);
41 } catch (IOException e) {
42 System.err.println(e);
43 }
44 return map;
45 }
46 }
- 解析JSON文件

1 import java.io.File;
2 import java.io.FileInputStream;
3 import java.io.FileNotFoundException;
4 import java.io.IOException;
5 import java.io.InputStream;
6 import java.util.ArrayList;
7 import java.util.HashMap;
8 import javax.json.Json;
9 import javax.json.stream.JsonParser;
10 import javax.json.stream.JsonParser.Event;
11
12 public class ParsingJSONFiles {
13 public static void main(String[] args) {
14 File dataFile = new File("data/Books.json");
15 try {
16 InputStream stream = new FileInputStream(dataFile);
17 JsonParser parser = Json.createParser(stream);
18 Event event = parser.next(); // advance past START_OBJECT
19 HashMap<String,Object> map = getMap(parser);
20 System.out.println(map);
21 stream.close();
22 } catch (FileNotFoundException e) {
23 System.out.println(e);
24 } catch (IOException e) {
25 System.out.println(e);
26 }
27 }
28
29 /* Returns the HashMap parsed by the specified parser.
30 Called when event.equals(event.START_OBJECT):
31 */
32 public static HashMap getMap(JsonParser parser) {
33 HashMap<String,Object> map = new HashMap();
34 Event event = parser.next(); // advance past START_OBJECT
35 String key = parser.getString();
36 event = parser.next(); // advance past KEY_NAME
37 while (!event.equals(Event.END_OBJECT)) {
38 if (event.equals(Event.VALUE_STRING)) {
39 String value = parser.getString();
40 map.put(key, value);
41 } else if (event.equals(Event.VALUE_NUMBER)) {
42 Integer value = parser.getInt();
43 map.put(key, value);
44 } else if (event.equals(Event.START_ARRAY)) {
45 ArrayList<String> list = getList(parser);
46 map.put(key, list);
47 }
48 event = parser.next();
49 if (event.equals(Event.END_OBJECT)) {
50 break;
51 }
52 key = parser.getString();
53 event = parser.next();
54 }
55 return map;
56 }
57
58 /* Returns the ArrayList parsed by the specified parser.
59 Called when event.equals(event.START_ARRAY):
60 */
61 public static ArrayList getList(JsonParser parser) {
62 ArrayList list = new ArrayList();
63 Event event = parser.next(); // advance past START_ARRAY
64 while (!event.equals(Event.END_ARRAY)) {
65 if (event.equals(Event.VALUE_STRING)) {
66 list.add(parser.getString());
67 event = parser.next();
68 } else if (event.equals(Event.START_OBJECT)) {
69 HashMap<String,Object> map = getMap(parser);
70 list.add(map);
71 event = parser.next();
72 } else if (event.equals(Event.START_ARRAY)) {
73 ArrayList subList = getList(parser); // recursion
74 list.add(subList);
75 event = parser.next();
76 }
77 }
78 return list;
79 }
80 }
数据处理
- 生成测试数据集

1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.io.PrintWriter;
4 import java.util.Random;
5
public class GeneratingTestData {
    private static final int ROWS = 8, COLS = 5;
    private static final Random RANDOM = new Random();

    /**
     * Writes ROWS lines of COLS comma-separated random values in [0,1),
     * each with six decimals, to {@code data/Output.csv}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File outputFile = new File("data/Output.csv");
        // try-with-resources flushes and closes the writer on every path.
        try (PrintWriter writer = new PrintWriter(outputFile)) {
            for (int i = 0; i < ROWS; i++) {
                for (int j = 0; j < COLS-1; j++) {
                    // Locale.ROOT pins the decimal separator to '.'; with the
                    // default locale, e.g. German, "%.6f" emits "0,123456"
                    // and the extra commas corrupt the CSV.
                    writer.printf(java.util.Locale.ROOT, "%.6f,", RANDOM.nextDouble());
                }
                writer.printf(java.util.Locale.ROOT, "%.6f%n", RANDOM.nextDouble());
            }
            // PrintWriter swallows IOExceptions; surface them explicitly.
            if (writer.checkError()) {
                System.err.println("Error while writing " + outputFile);
            }
        } catch (FileNotFoundException e) {
            System.err.println(e);
        }
    }
}
- 数据过滤
- 需求:选择国土面积超过100万平方公里的内陆国家
- 过程:数据为dat格式,先定义对应简单类country,再写程序将dat中数据存在country的Set中,最后做筛选
Country.java

1 import java.util.HashSet;
2 import java.util.Scanner;
3
/** One record of the countries data set: name, population, area, landlocked flag. */
class Country {
    protected String name;
    protected int population;
    protected int area;
    protected boolean landlocked;

    /**
     * Constructs a new Country object from the next four tokens being
     * scanned: name, population, area, landlocked.
     * If no token remains, the fields keep their defaults
     * (name is null, population and area are 0, landlocked is false).
     *
     * @param in scanner positioned at the start of a record
     */
    public Country(Scanner in) {
        // hasNext() asks "is there another TOKEN", which is what next() needs.
        // hasNextLine() is also true for the empty remainder after the final
        // newline, and next() then threw NoSuchElementException.
        if (in.hasNext()) {
            this.name = in.next();
            this.population = in.nextInt();
            this.area = in.nextInt();
            this.landlocked = in.nextBoolean();
        }
    }

    /** Returns the record as aligned text: name, population, area, landlocked. */
    @Override
    public String toString() {
        return String.format("%-10s %,12d %,12d %b",
                name, population, area, landlocked);
    }
}
FilteringData.java

1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.HashSet;
4 import java.util.Scanner;
5 import java.util.Set;
6 import java.util.TreeMap;
7
8 public class FilteringData {
9 private static final int MIN_AREA = 1000000; // one million
10 public static void main(String[] args) {
11 File file = new File("data/Countries.dat");
12 Set<Country> dataset = readDataset(file);
13
14 for (Country country : dataset) {
15 if (country.landlocked && country.area >= MIN_AREA) {
16 System.out.println(country);
17 }
18 }
19 }
20
21 public static Set readDataset(File file) {
22 Set<Country> set = new HashSet();
23 try {
24 Scanner input = new Scanner(file);
25 input.nextLine(); // read past headers
26 while (input.hasNextLine()) {
27 set.add(new Country(input));
28 }
29 input.close();
30 } catch (FileNotFoundException e) {
31 System.out.println(e);
32 }
33 return set;
34 }
35 }
- 排序
- 需求:将Countries.dat中数据按population进行排序
- 实现:将数据存入TreeMap
- 注意:关键字段必须唯一,即两个国家人口不能相同

1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.Collections;
4 import java.util.HashMap;
5 import java.util.Scanner;
6 import java.util.Set;
7 import java.util.TreeMap;
8
public class SortingData {
    /**
     * Reads (name, population) pairs from Countries.dat into a TreeMap keyed
     * by population and prints them in ascending population order.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File file = new File("src/main/java/com/hongfeng/SortingData/Countries.dat");
        TreeMap<Integer,String> dataset = new TreeMap<>();
        // try-with-resources closes the Scanner even if a record is malformed.
        try (Scanner input = new Scanner(file)) {
            while (input.hasNext()) {
                String name = input.next();
                int population = input.nextInt();
                String previous = dataset.put(population, name);
                if (previous != null) {
                    // The population is the map key, so an equal population
                    // replaces the earlier country; report it, don't hide it.
                    System.err.printf("Duplicate population %d: %s replaces %s%n",
                            population, name, previous);
                }
            }
        } catch (FileNotFoundException e) {
            System.out.println(e);
        }
        print(dataset);
    }

    /**
     * Prints one line per entry in ascending key order: the key as a grouped
     * integer in 12 columns, a space, then the value left-justified in 16.
     *
     * @param map the entries to print
     */
    public static void print(TreeMap<Integer,String> map) {
        // entrySet() avoids a second tree lookup per key.
        for (java.util.Map.Entry<Integer,String> entry : map.entrySet()) {
            System.out.printf("%,12d %-16s%n", entry.getKey(), entry.getValue());
        }
    }
}
- 合并
- 需求:将多个排好序的文件合并为单个排好序的文件
- Country类实现Comparable接口,定义从文件创建对象的构造方法,以及比较方法
- 扫描两个文件,比较,存入新文件,一个文件扫描完后,另一个文件逐项扫描即可
Country.java

1 import java.util.Scanner;
2
/** A (name, population) record that orders by population. */
class Country implements Comparable{
    protected String name;
    protected int population;

    /**
     * Constructs a new Country object from the next two tokens being
     * scanned: name, then population.
     * If no token remains, name stays null and population stays 0, which
     * {@link #isNull()} reports as the end-of-input marker.
     *
     * @param in scanner positioned at the start of a record
     */
    public Country(Scanner in) {
        // hasNext() asks "is there another TOKEN", which is what next() needs.
        // hasNextLine() is also true for the empty remainder after the final
        // newline, and next() then threw NoSuchElementException.
        if (in.hasNext()) {
            this.name = in.next();
            this.population = in.nextInt();
        }
    }

    /** Returns true if this object was built from exhausted input (no name read). */
    public boolean isNull(){
        return this.name == null;
    }

    /**
     * Orders countries by ascending population.
     *
     * @param object the Country to compare with
     * @return a negative, zero, or positive value as this population is
     *         less than, equal to, or greater than the other's
     * @throws ClassCastException if object is not a Country
     */
    @Override
    public int compareTo(Object object){
        Country that = (Country)object;
        // Integer.compare cannot overflow; "this.population - that.population"
        // wraps around when the difference exceeds the int range.
        return Integer.compare(this.population, that.population);
    }

    /** Returns the record as aligned text: name, then grouped population. */
    @Override
    public String toString() {
        return String.format("%-10s %,12d",
                name, population);
    }
}
MergingFiles

1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.io.PrintWriter;
4 import java.util.Scanner;
5
6 public class MergingFiles {
7 public static void main(String[] args) {
8 File inFile1 = new File("data/Countries1.dat");
9 File inFile2 = new File("data/Countries2.dat");
10 File outFile = new File("data/Countries.dat");
11 try {
12 Scanner in1 = new Scanner(inFile1);
13 Scanner in2 = new Scanner(inFile2);
14 PrintWriter out = new PrintWriter(outFile);
15 Country country1 = new Country(in1);
16 Country country2 = new Country(in2);
17 System.out.println(country1.hashCode());
18 System.out.println(country2.hashCode());
19 while (!country1.isNull() && !country2.isNull()) {
20 if (country1.compareTo(country2) < 0) {
21 out.println(country1);
22 country1 = new Country(in1);
23 } else {
24 out.println(country2);
25 country2 = new Country(in2);
26 }
27 }
28 while (!country1.isNull()) {
29 out.println(country1);
30 country1 = new Country(in1);
31 }
32 while (!country2.isNull()) {
33 out.println(country2);
34 country2 = new Country(in2);
35 }
36 in1.close();
37 in2.close();
38 out.close();
39 } catch (FileNotFoundException e) {
40 System.out.println(e);
41 }
42 }
43 }
[Java] 数据分析--数据预处理的更多相关文章
- pandas神器操作excel表格大全(数据分析数据预处理)
使用pandas库操作excel,csv表格操作大全 关注公众号"轻松学编程"了解更多,文末有公众号二维码,可以扫码关注哦. 前言 准备三份csv表格做演示: 成绩表.csv su ...
- [Java]数据分析--数据可视化
时间序列 需求:将一组字符顺序添加到时间序列中 实现:定义时间序列类TimeSeries,包含静态类Entry表示序列类中的各项,以及add,get,iterator,entry方法 TimeSeri ...
- 小白学 Python 数据分析(9):Pandas (八)数据预处理(2)
人生苦短,我用 Python 前文传送门: 小白学 Python 数据分析(1):数据分析基础 小白学 Python 数据分析(2):Pandas (一)概述 小白学 Python 数据分析(3):P ...
- 【新人赛】阿里云恶意程序检测 -- 实践记录10.20 - 数据预处理 / 训练数据分析 / TF-IDF模型调参
Colab连接与数据预处理 Colab连接方法见上一篇博客 数据预处理: import pandas as pd import pickle import numpy as np # 训练数据和测试数 ...
- EEGLAB数据分析:预处理与后续处理
来源:http://blog.sina.com.cn/s/blog_13171a73d0102v4zx.html 数据预处理主要包括数据导入.电极定位.电极返回.滤波.去除伪迹.重建参考.分段.叠加平 ...
- Java大数据应用领域及就业方向
最难毕业季,2017高校毕业生达到795万,许多学生面临着毕业即失业的尴尬.面对着与日俱增的竞争形势和就业压力,很多毕业生选择去知了堂学习社区镀金,以提高自己的就业竞争力,其中Java大数据是学生选择 ...
- Java大数据人才应用领域广,就业薪酬高
互联网创造了大数据应用的规模化环境,大数据应用成功的案例大都是在互联网上发生的, 互联网业务提供了数据,互联网企业开发了处理软件,互联网企业的创新带来了大数据应用 的活跃,没有互联网便没有今天的大数据 ...
- 数据准备<3>:数据预处理
数据预处理是指因为算法或者分析需要,对经过数据质量检查后的数据进行转换.衍生.规约等操作的过程.整个数据预处理工作主要包括五个方面内容:简单函数变换.标准化.衍生虚拟变量.离散化.降维.本文将作展开介 ...
- 【sklearn】数据预处理 sklearn.preprocessing
数据预处理 标准化 (Standardization) 规范化(Normalization) 二值化 分类特征编码 推定缺失数据 生成多项式特征 定制转换器 1. 标准化Standardization ...
随机推荐
- Tkinter系列教程01—引言和安装Tk—Python GUI编程
目录 Tkinter教程系列01--引言和安装Tk 引言 什么是Tkinter 安装 Tk 为 Windows 安装 Tk 验证是否安装正确 为 GNU/Linux 安装 Tk 使用 Linux 的包 ...
- [DFS]排列的生成
排列的生成 Time Limit:1000MS Memory Limit:65536K Total Submit:150 Accepted:95 Description 输出P(n,m)的排列(n,m ...
- Kafka 消息存储机制
Kafka 消息以 Partition 作为存储单元,那么在 Partition 内消息是以什么样的格式存储的呢,如何处理 Partition 中的消息,又有哪些安全策略来保证消息不会丢失呢,这一篇我 ...
- Dynamics CRM制作报表的时候让用户可以用自己的权限浏览数据
我们做SSRS报表的时候最头疼的问题就是用Sql查出来的数据都是全部数据没有做权限过滤,导致不同用户看到的数据是一样的. 确实Dynamics CRM产品的数据库时有对这个做处理的,其中每个实体都会有 ...
- Istio最佳实践系列:如何实现方法级调用跟踪?
赵化冰,腾讯云高级工程师,Istio Member,ServiceMesher 管理委员,Istio 项目贡献者,热衷于开源.网络和云计算.目前主要从事服务网格的开源和研发工作. 引言 TCM(Ten ...
- Linux 网络工具中的瑞士军刀 - socat & netcat
独立博客阅读:https://ryan4yin.space/posts/socat-netcat/ 文中的命令均在 macOS Big Sur 和 Opensuse Tumbleweed 上测试通过 ...
- java反射Array的使用
1.什么是Array Array是一个类的简写,全限定类名是java.lang.reflect.Array. 2.Array有什么用 Array可以代表所有的数组,可以通过Array动态创建与修改里面 ...
- k8s helm 安装etcd
待续 helm install etcd bitnami/etcd \ --set statefulset.replicaCount=3 \ --set persistence.enabled=tru ...
- OAuth 2.0、OIDC 原理
OAuth 目录 OAuth 什么是 OAuth? 为什么是 OAuth? SAML OAuth 和 API OAuth 主要组件 OAuth 作用域 OAuth 参与者 OAuth 令牌 OAuth ...
- 给HTML5 Video 设置多语言字幕文件
现在各种支持HTML5的浏览器都能够播放html5视频了,但是对于字幕的支持却很少,我们期待像DVD那样强大的字幕. 往往我们还不得不通过js来做,着实是一件痛苦的事情. 现在IE10率先对HTML5 ...