数据结构

  • 键-值对:HashMap

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.HashMap;
4 import java.util.Scanner;
5
/**
 * Demonstrates storing key-value pairs in a {@link HashMap} by loading
 * country populations from a whitespace-delimited data file.
 */
public class HashMapExample {
    /**
     * Reads (country, population) pairs from {@code data/Countries.dat} into
     * a map, then prints the number of entries and the value stored for Peru.
     * A missing file is reported on standard output and leaves the map empty.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File dataFile = new File("data/Countries.dat");
        HashMap<String, Integer> dataset = new HashMap<>();
        // try-with-resources closes the Scanner (and the file behind it) even
        // when a malformed record makes nextInt() throw.
        try (Scanner input = new Scanner(dataFile)) {
            while (input.hasNext()) {
                String country = input.next();
                int population = input.nextInt();
                dataset.put(country, population);
            }
        } catch (FileNotFoundException e) {
            System.out.println(e);
        }
        System.out.printf("dataset.size(): %d%n", dataset.size());
        // Prints "null" when the file has no entry for Peru.
        System.out.printf("dataset.get(\"Peru\"): %,d%n", dataset.get("Peru"));
    }
}

文件处理

  • csv文件

    • 将Map数据存入Excel文件(.xls,使用Apache POI)

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.io.FileOutputStream;
4 import java.io.IOException;
5 import java.util.Map;
6 import java.util.Scanner;
7 import java.util.Set;
8 import java.util.TreeMap;
9 import org.apache.poi.hssf.usermodel.HSSFRow;
10 import org.apache.poi.hssf.usermodel.HSSFSheet;
11 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
12
13 public class FromMapToExcel {
14 public static void main(String[] args) {
15 Map<String,Integer> map = new TreeMap();
16 load(map, "data/Countries.dat");
17 print(map);
18 storeXL(map, "data/Countries.xls", "Countries Worksheet");
19 }
20
21 /** Loads the data from the specified file into the specified map.
22 */
23 public static void load(Map map, String fileSpec) {
24 File file = new File(fileSpec);
25 try {
26 Scanner input = new Scanner(file);
27 while (input.hasNext()) {
28 String country = input.next();
29 int population = input.nextInt();
30 map.put(country, population);
31 }
32 } catch (FileNotFoundException e) {
33 System.out.println(e);
34 }
35 }
36
37 public static void print(Map map) {
38 Set countries = map.keySet();
39 for (Object country : countries) {
40 Object population = map.get(country);
41 System.out.printf("%-10s%,12d%n", country, population);
42 }
43 }
44
45 /** Stores the specified map in the specified worksheet of
46 the specified Excel workbook file.
47 * @param map
48 * @param fileSpec
49 * @param sheet
50 */
51 public static void storeXL(Map map, String fileSpec, String sheet) {
52 try {
53 FileOutputStream out = new FileOutputStream(fileSpec);
54 HSSFWorkbook workbook = new HSSFWorkbook();
55 HSSFSheet worksheet = workbook.createSheet(sheet);
56 Set countries = map.keySet();
57 short rowNum = 0;
58 for (Object country : countries) {
59 Object population = map.get(country);
60 HSSFRow row = worksheet.createRow(rowNum);
61 row.createCell(0).setCellValue((String)country);
62 row.createCell(1).setCellValue((Integer)population);
63 ++rowNum;
64 }
65 workbook.write(out);
66 out.flush();
67 out.close();
68 } catch (FileNotFoundException e) {
69 System.err.println(e);
70 } catch (IOException e) {
71 System.err.println(e);
72 }
73 }
74 }
    • 读取csv文件

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.HashMap;
4 import java.util.Scanner;
5
/**
 * Reads a two-column CSV file (a header row followed by country/population
 * rows) and prints it as an aligned table.
 */
public class ReadingCSVFiles {
    /**
     * Prints the contents of {@code data/Countries.csv}. A missing file is
     * reported on standard output; an empty file prints nothing.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File dataFile = new File("data/Countries.csv");
        try (Scanner input = new Scanner(dataFile)) {
            // A comma with optional surrounding whitespace, or any run of
            // whitespace, separates fields. A single-character delimiter
            // such as ",|\\s" would produce empty tokens for "\r\n" line
            // endings or ", " separators and make nextInt() fail.
            input.useDelimiter("\\s*,\\s*|\\s+");
            if (!input.hasNext()) {
                return;   // empty file: no header, nothing to print
            }
            String column1 = input.next();
            String column2 = input.next();
            System.out.printf("%-10s%12s%n", column1, column2);
            while (input.hasNext()) {
                String country = input.next();
                int population = input.nextInt();
                System.out.printf("%-10s%,12d%n", country, population);
            }
        } catch (FileNotFoundException e) {
            System.out.println(e);
        }
    }
}
    • 读取Excel文件(.xls)到Map

 1 import static dawj.ch02.FromMapToExcel.print;
2 import java.io.FileInputStream;
3 import java.io.FileNotFoundException;
4 import java.io.IOException;
5 import java.util.Map;
6 import java.util.TreeMap;
7 import org.apache.poi.hssf.usermodel.HSSFCell;
8 import org.apache.poi.hssf.usermodel.HSSFRow;
9 import org.apache.poi.hssf.usermodel.HSSFSheet;
10 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
11 import org.apache.poi.ss.usermodel.DataFormatter;
12 import org.apache.poi.ss.usermodel.Row;
13
14 public class FromExcelToMap {
15 public static void main(String[] args) {
16 Map map = loadXL("data/Countries.xls", "Countries Worksheet");
17 print(map);
18 }
19
20 /** Returns a Map object containing the data from the specified
21 worksheet in the specified Excel file.
22 */
23 public static Map loadXL(String fileSpec, String sheetName) {
24 Map<String,Integer> map = new TreeMap();
25 try {
26 FileInputStream stream = new FileInputStream(fileSpec);
27 HSSFWorkbook workbook = new HSSFWorkbook(stream);
28 HSSFSheet worksheet = workbook.getSheet(sheetName);
29 DataFormatter formatter = new DataFormatter();
30 for (Row row : worksheet) {
31 HSSFRow hssfRow = (HSSFRow)row;
32 HSSFCell cell = hssfRow.getCell(0);
33 String country = cell.getStringCellValue();
34 cell = hssfRow.getCell(1);
35 String str = formatter.formatCellValue(cell);
36 int population = (int)Integer.getInteger(str);
37 map.put(country, population);
38 }
39 } catch (FileNotFoundException e) {
40 System.err.println(e);
41 } catch (IOException e) {
42 System.err.println(e);
43 }
44 return map;
45 }
46 }
  • 解析JSON文件

 1 import java.io.File;
2 import java.io.FileInputStream;
3 import java.io.FileNotFoundException;
4 import java.io.IOException;
5 import java.io.InputStream;
6 import java.util.ArrayList;
7 import java.util.HashMap;
8 import javax.json.Json;
9 import javax.json.stream.JsonParser;
10 import javax.json.stream.JsonParser.Event;
11
12 public class ParsingJSONFiles {
13 public static void main(String[] args) {
14 File dataFile = new File("data/Books.json");
15 try {
16 InputStream stream = new FileInputStream(dataFile);
17 JsonParser parser = Json.createParser(stream);
18 Event event = parser.next(); // advance past START_OBJECT
19 HashMap<String,Object> map = getMap(parser);
20 System.out.println(map);
21 stream.close();
22 } catch (FileNotFoundException e) {
23 System.out.println(e);
24 } catch (IOException e) {
25 System.out.println(e);
26 }
27 }
28
29 /* Returns the HashMap parsed by the specified parser.
30 Called when event.equals(event.START_OBJECT):
31 */
32 public static HashMap getMap(JsonParser parser) {
33 HashMap<String,Object> map = new HashMap();
34 Event event = parser.next(); // advance past START_OBJECT
35 String key = parser.getString();
36 event = parser.next(); // advance past KEY_NAME
37 while (!event.equals(Event.END_OBJECT)) {
38 if (event.equals(Event.VALUE_STRING)) {
39 String value = parser.getString();
40 map.put(key, value);
41 } else if (event.equals(Event.VALUE_NUMBER)) {
42 Integer value = parser.getInt();
43 map.put(key, value);
44 } else if (event.equals(Event.START_ARRAY)) {
45 ArrayList<String> list = getList(parser);
46 map.put(key, list);
47 }
48 event = parser.next();
49 if (event.equals(Event.END_OBJECT)) {
50 break;
51 }
52 key = parser.getString();
53 event = parser.next();
54 }
55 return map;
56 }
57
58 /* Returns the ArrayList parsed by the specified parser.
59 Called when event.equals(event.START_ARRAY):
60 */
61 public static ArrayList getList(JsonParser parser) {
62 ArrayList list = new ArrayList();
63 Event event = parser.next(); // advance past START_ARRAY
64 while (!event.equals(Event.END_ARRAY)) {
65 if (event.equals(Event.VALUE_STRING)) {
66 list.add(parser.getString());
67 event = parser.next();
68 } else if (event.equals(Event.START_OBJECT)) {
69 HashMap<String,Object> map = getMap(parser);
70 list.add(map);
71 event = parser.next();
72 } else if (event.equals(Event.START_ARRAY)) {
73 ArrayList subList = getList(parser); // recursion
74 list.add(subList);
75 event = parser.next();
76 }
77 }
78 return list;
79 }
80 }

数据处理

  • 生成测试数据集

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.io.PrintWriter;
4 import java.util.Random;
5
/**
 * Generates a CSV file of uniformly distributed random numbers for use as
 * test data.
 */
public class GeneratingTestData {
    private static final int ROWS = 8, COLS = 5;
    private static final Random RANDOM = new Random();

    /**
     * Writes {@code ROWS} lines of {@code COLS} comma-separated random values
     * in [0, 1), each with six decimal places, to {@code data/Output.csv}.
     * A failure to open the file is reported on standard error.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File outputFile = new File("data/Output.csv");
        // try-with-resources flushes and closes the writer on every exit path.
        try (PrintWriter writer = new PrintWriter(outputFile)) {
            for (int i = 0; i < ROWS; i++) {
                for (int j = 0; j < COLS - 1; j++) {
                    // Locale.ROOT forces '.' as the decimal separator. With the
                    // default locale, comma-decimal locales would write
                    // "0,123456" and corrupt the comma-separated layout.
                    writer.printf(java.util.Locale.ROOT, "%.6f,", RANDOM.nextDouble());
                }
                writer.printf(java.util.Locale.ROOT, "%.6f%n", RANDOM.nextDouble());
            }
        } catch (FileNotFoundException e) {
            System.err.println(e);
        }
    }
}
  • 数据过滤

    • 需求:选择国土面积不小于100万平方公里的内陆国家
    • 过程:数据为dat格式,先定义对应简单类country,再写程序将dat中数据存在country的Set中,最后做筛选

Country.java

 1 import java.util.HashSet;
2 import java.util.Scanner;
3
/**
 * A country record: name, population, area and whether it is landlocked.
 */
class Country {
    protected String name;
    protected int population;
    protected int area;
    protected boolean landlocked;

    /**
     * Constructs a new Country object from the next record being scanned:
     * a name, an integer population, an integer area and a boolean, separated
     * by whitespace. If no tokens remain, the fields keep their default
     * values ({@code name} is null, the numbers are 0, {@code landlocked}
     * is false).
     *
     * @param in the scanner positioned at the start of a record
     */
    public Country(Scanner in) {
        // hasNext() rather than hasNextLine(): a trailing newline or blank
        // line still counts as a "line" but holds no token, so guarding with
        // hasNextLine() made in.next() throw NoSuchElementException.
        if (in.hasNext()) {
            this.name = in.next();
            this.population = in.nextInt();
            this.area = in.nextInt();
            this.landlocked = in.nextBoolean();
        }
    }

    /** Returns the fields formatted in aligned columns. */
    @Override
    public String toString() {
        return String.format("%-10s %,12d %,12d %b",
                name, population, area, landlocked);
    }
}

FilteringData.java

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.HashSet;
4 import java.util.Scanner;
5 import java.util.Set;
6 import java.util.TreeMap;
7
8 public class FilteringData {
9 private static final int MIN_AREA = 1000000; // one million
10 public static void main(String[] args) {
11 File file = new File("data/Countries.dat");
12 Set<Country> dataset = readDataset(file);
13
14 for (Country country : dataset) {
15 if (country.landlocked && country.area >= MIN_AREA) {
16 System.out.println(country);
17 }
18 }
19 }
20
21 public static Set readDataset(File file) {
22 Set<Country> set = new HashSet();
23 try {
24 Scanner input = new Scanner(file);
25 input.nextLine(); // read past headers
26 while (input.hasNextLine()) {
27 set.add(new Country(input));
28 }
29 input.close();
30 } catch (FileNotFoundException e) {
31 System.out.println(e);
32 }
33 return set;
34 }
35 }
  • 排序

    • 需求:将Countries.dat中数据按population进行排序
    • 实现:将数据存入TreeMap
    • 注意:关键字段必须唯一,即两个国家人口不能相同

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.Collections;
4 import java.util.HashMap;
5 import java.util.Scanner;
6 import java.util.Set;
7 import java.util.TreeMap;
8
/**
 * Sorts countries by population by loading them into a {@link TreeMap} keyed
 * on population.
 */
public class SortingData {
    /**
     * Reads (country, population) pairs from the data file and prints them in
     * ascending order of population. A missing file is reported on standard
     * output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File file = new File("src/main/java/com/hongfeng/SortingData/Countries.dat");
        TreeMap<Integer, String> dataset = new TreeMap<>();
        // try-with-resources closes the file even if a record is malformed.
        try (Scanner input = new Scanner(file)) {
            while (input.hasNext()) {
                String name = input.next();
                int population = input.nextInt();
                String previous = dataset.put(population, name);
                if (previous != null) {
                    // Two countries share this population. Keep both names
                    // rather than silently dropping the earlier one.
                    dataset.put(population, previous + ", " + name);
                }
            }
        } catch (FileNotFoundException e) {
            System.out.println(e);
        }
        print(dataset);
    }

    /**
     * Prints the map in ascending key order, one "population name" line per
     * entry.
     *
     * @param map populations mapped to country names
     */
    public static void print(TreeMap<Integer, String> map) {
        for (Integer key : map.keySet()) {
            System.out.printf("%,12d %-16s%n", key, map.get(key));
        }
    }
}
  • 合并

    • 需求:将多个排好序的文件合并为单个排好序的文件
    • Country类实现Comparable接口,定义从文件创建对象的构造方法,以及比较方法
    • 扫描两个文件,比较,存入新文件,一个文件扫描完后,另一个文件逐项扫描即可

Country.java

 1 import java.util.Scanner;
2
3 class Country implements Comparable{
4 protected String name;
5 protected int population;
6
7 /* Constructs a new Country object from the next line being scanned.
8 If there are no more lines, the new object's fields are left null.
9 */
10 public Country(Scanner in) {
11 if (in.hasNextLine()) {
12 this.name = in.next();
13 this.population = in.nextInt();
14 }
15 }
16
17 public boolean isNull(){
18 return this.name == null;
19 }
20
21 @Override
22 public int compareTo(Object object){
23 Country that = (Country)object;
24 return this.population - that.population;
25 }
26
27 @Override
28 public String toString() {
29 return String.format("%-10s %,12d",
30 name, population);
31 }
32 }

MergingFiles

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.io.PrintWriter;
4 import java.util.Scanner;
5
6 public class MergingFiles {
7 public static void main(String[] args) {
8 File inFile1 = new File("data/Countries1.dat");
9 File inFile2 = new File("data/Countries2.dat");
10 File outFile = new File("data/Countries.dat");
11 try {
12 Scanner in1 = new Scanner(inFile1);
13 Scanner in2 = new Scanner(inFile2);
14 PrintWriter out = new PrintWriter(outFile);
15 Country country1 = new Country(in1);
16 Country country2 = new Country(in2);
17 System.out.println(country1.hashCode());
18 System.out.println(country2.hashCode());
19 while (!country1.isNull() && !country2.isNull()) {
20 if (country1.compareTo(country2) < 0) {
21 out.println(country1);
22 country1 = new Country(in1);
23 } else {
24 out.println(country2);
25 country2 = new Country(in2);
26 }
27 }
28 while (!country1.isNull()) {
29 out.println(country1);
30 country1 = new Country(in1);
31 }
32 while (!country2.isNull()) {
33 out.println(country2);
34 country2 = new Country(in2);
35 }
36 in1.close();
37 in2.close();
38 out.close();
39 } catch (FileNotFoundException e) {
40 System.out.println(e);
41 }
42 }
43 }

[Java] 数据分析--数据预处理的更多相关文章

  1. pandas神器操作excel表格大全(数据分析数据预处理)

    使用pandas库操作excel,csv表格操作大全 关注公众号"轻松学编程"了解更多,文末有公众号二维码,可以扫码关注哦. 前言 准备三份csv表格做演示: 成绩表.csv su ...

  2. [Java]数据分析--数据可视化

    时间序列 需求:将一组字符顺序添加到时间序列中 实现:定义时间序列类TimeSeries,包含静态类Entry表示序列类中的各项,以及add,get,iterator,entry方法 TimeSeri ...

  3. 小白学 Python 数据分析(9):Pandas (八)数据预处理(2)

    人生苦短,我用 Python 前文传送门: 小白学 Python 数据分析(1):数据分析基础 小白学 Python 数据分析(2):Pandas (一)概述 小白学 Python 数据分析(3):P ...

  4. 【新人赛】阿里云恶意程序检测 -- 实践记录10.20 - 数据预处理 / 训练数据分析 / TF-IDF模型调参

    Colab连接与数据预处理 Colab连接方法见上一篇博客 数据预处理: import pandas as pd import pickle import numpy as np # 训练数据和测试数 ...

  5. EEGLAB数据分析:预处理与后续处理

    来源:http://blog.sina.com.cn/s/blog_13171a73d0102v4zx.html 数据预处理主要包括数据导入.电极定位.电极返回.滤波.去除伪迹.重建参考.分段.叠加平 ...

  6. Java大数据应用领域及就业方向

    最难毕业季,2017高校毕业生达到795万,许多学生面临着毕业即失业的尴尬.面对着与日俱增的竞争形势和就业压力,很多毕业生选择去知了堂学习社区镀金,以提高自己的就业竞争力,其中Java大数据是学生选择 ...

  7. Java大数据人才应用领域广,就业薪酬高

    互联网创造了大数据应用的规模化环境,大数据应用成功的案例大都是在互联网上发生的, 互联网业务提供了数据,互联网企业开发了处理软件,互联网企业的创新带来了大数据应用 的活跃,没有互联网便没有今天的大数据 ...

  8. 数据准备<3>:数据预处理

    数据预处理是指因为算法或者分析需要,对经过数据质量检查后的数据进行转换.衍生.规约等操作的过程.整个数据预处理工作主要包括五个方面内容:简单函数变换.标准化.衍生虚拟变量.离散化.降维.本文将作展开介 ...

  9. 【sklearn】数据预处理 sklearn.preprocessing

    数据预处理 标准化 (Standardization) 规范化(Normalization) 二值化 分类特征编码 推定缺失数据 生成多项式特征 定制转换器 1. 标准化Standardization ...

随机推荐

  1. PAT (Advanced Level) Practice 1019 General Palindromic Number (20 分) 凌宸1642

    PAT (Advanced Level) Practice 1019 General Palindromic Number (20 分) 凌宸1642 题目描述: A number that will ...

  2. SpringCloud(六)分布式事务

    在分布式系统中,分布式事务基本上是绕不开的, 分布式事务是指事务的参与者.支持事务的服务器.资源服务器以及事务管理器分别位于不同的分布式系统的不同节点之上 .其实就可以简单理解成在分布式系统中实现事务 ...

  3. Istio 网络弹性 实践 之 故障注入 和 调用重试

    网络弹性介绍 网络弹性也称为运维弹性,是指网络在遇到灾难事件时快速恢复和继续运行的能力.灾难事件的范畴很广泛,比如长时间停电.网络设备故障.恶意入侵等. 重试(attempts) Istio 重试机制 ...

  4. Golang学习的方法和建议

    学习方法: 学习方向:go方向是没有问题的 学习方法:多思考多练习,注重语法和关键词练习,切记哑巴学习,会看不会写,切记注意多写 课外学习,数据结构和算法:清华 谭浩强老师(链表.数组.排序...等等 ...

  5. 翻译:《实用的Python编程》07_04_Function_decorators

    目录 | 上一节 (7.3 返回函数) | 下一节 (7.5 装饰方法) 7.4 函数装饰器 本节介绍装饰器(decorator).因为这是一个高级主题,所以我们只做简单介绍. 译注:根据译者个人的猜 ...

  6. Dynamic Programming 动态规划入门笔记

    算法导论笔记 programming 指的是一种表格法,并非编写计算机程序 动态规划与分治方法相似,都是通过组合子问题的解来求解原问题.但是分治法将问题划分为互不相交的子问题.而动态规划是应用与子问题 ...

  7. centos7.4 卸载python2.7.5安装python3.6.3版本

    CentOS 中默认安装了 2.7的Python,为了使用新版 python,可以对旧版本进行升级.但是由于很多基本的命令.软件包都依赖旧版本,比如:yum等.所以,在更新 Python 时,建议不要 ...

  8. 17. Vue2.4+新增属性$listeners

    现在我们来讨论一种情况,A组件与C组件怎么通信,我们有多少种解决方案? 我们使用VueX来进行数据管理,但是如果项目中多个组件共享状态比较少,项目比较小,并且全局状态比较少,那使用VueX来实现该功能 ...

  9. Salesforce学习之路(六)利用Visualforce Page实现页面的动态刷新功能

    Visualforce是一个Web开发框架,允许开发人员构建可以在Lightning平台上本地托管的自定义用户界面.其框架包含:前端的界面设计,使用的类似于HTML的标记语言:以及后端的控制器,使用类 ...

  10. 刨死你系列——手撕ArrayList

    不多BB,直接上代码: public class MyArrayList { //创建数组对象 private Object[] elements; //已使用数组长度 private int siz ...