数据结构

  • 键-值对:HashMap

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.HashMap;
4 import java.util.Scanner;
5
/**
 * Demonstrates key-value storage with a {@code HashMap}: loads
 * (country, population) pairs from a whitespace-delimited text file
 * and looks one of them up by key.
 */
public class HashMapExample {
    /**
     * Reads {@code data/Countries.dat} as alternating name / integer tokens,
     * stores each pair in a map, then prints the map size and the value
     * stored under the key "Peru".
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File dataFile = new File("data/Countries.dat");
        HashMap<String, Integer> dataset = new HashMap<>();
        // try-with-resources releases the file handle even when a token
        // cannot be parsed and nextInt() throws.
        try (Scanner input = new Scanner(dataFile)) {
            while (input.hasNext()) {
                String country = input.next();
                int population = input.nextInt();
                dataset.put(country, population);
            }
        } catch (FileNotFoundException e) {
            System.out.println(e);
        }
        System.out.printf("dataset.size(): %d%n", dataset.size());
        // A missing key yields null, which printf renders as "null".
        System.out.printf("dataset.get(\"Peru\"): %,d%n", dataset.get("Peru"));
    }
}

文件处理

  • csv与Excel文件

    • 将Map数据存入Excel文件(.xls,使用Apache POI)

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.io.FileOutputStream;
4 import java.io.IOException;
5 import java.util.Map;
6 import java.util.Scanner;
7 import java.util.Set;
8 import java.util.TreeMap;
9 import org.apache.poi.hssf.usermodel.HSSFRow;
10 import org.apache.poi.hssf.usermodel.HSSFSheet;
11 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
12
13 public class FromMapToExcel {
14 public static void main(String[] args) {
15 Map<String,Integer> map = new TreeMap();
16 load(map, "data/Countries.dat");
17 print(map);
18 storeXL(map, "data/Countries.xls", "Countries Worksheet");
19 }
20
21 /** Loads the data from the specified file into the specified map.
22 */
23 public static void load(Map map, String fileSpec) {
24 File file = new File(fileSpec);
25 try {
26 Scanner input = new Scanner(file);
27 while (input.hasNext()) {
28 String country = input.next();
29 int population = input.nextInt();
30 map.put(country, population);
31 }
32 } catch (FileNotFoundException e) {
33 System.out.println(e);
34 }
35 }
36
37 public static void print(Map map) {
38 Set countries = map.keySet();
39 for (Object country : countries) {
40 Object population = map.get(country);
41 System.out.printf("%-10s%,12d%n", country, population);
42 }
43 }
44
45 /** Stores the specified map in the specified worksheet of
46 the specified Excel workbook file.
47 * @param map
48 * @param fileSpec
49 * @param sheet
50 */
51 public static void storeXL(Map map, String fileSpec, String sheet) {
52 try {
53 FileOutputStream out = new FileOutputStream(fileSpec);
54 HSSFWorkbook workbook = new HSSFWorkbook();
55 HSSFSheet worksheet = workbook.createSheet(sheet);
56 Set countries = map.keySet();
57 short rowNum = 0;
58 for (Object country : countries) {
59 Object population = map.get(country);
60 HSSFRow row = worksheet.createRow(rowNum);
61 row.createCell(0).setCellValue((String)country);
62 row.createCell(1).setCellValue((Integer)population);
63 ++rowNum;
64 }
65 workbook.write(out);
66 out.flush();
67 out.close();
68 } catch (FileNotFoundException e) {
69 System.err.println(e);
70 } catch (IOException e) {
71 System.err.println(e);
72 }
73 }
74 }
    • 读取csv文件

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.HashMap;
4 import java.util.Scanner;
5
/**
 * Reads a two-column CSV file (a header row followed by name,integer rows)
 * and prints it as an aligned table.
 */
public class ReadingCSVFiles {
    /**
     * Prints the two header fields of {@code data/Countries.csv}, then one
     * formatted line per (name, integer) record.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File dataFile = new File("data/Countries.csv");
        // try-with-resources closes the file even if a record is malformed.
        try (Scanner input = new Scanner(dataFile)) {
            // Treat any run of commas/whitespace as ONE separator. Matching a
            // single character at a time produced empty tokens for "\r\n"
            // line endings or ", " and made nextInt() fail.
            input.useDelimiter("[,\\s]+");
            String column1 = input.next();
            String column2 = input.next();
            System.out.printf("%-10s%12s%n", column1, column2);
            while (input.hasNext()) {
                String country = input.next();
                int population = input.nextInt();
                System.out.printf("%-10s%,12d%n", country, population);
            }
        } catch (FileNotFoundException e) {
            System.out.println(e);
        }
    }
}
    • 读取Excel文件(.xls)到Map

 1 import static dawj.ch02.FromMapToExcel.print;
2 import java.io.FileInputStream;
3 import java.io.FileNotFoundException;
4 import java.io.IOException;
5 import java.util.Map;
6 import java.util.TreeMap;
7 import org.apache.poi.hssf.usermodel.HSSFCell;
8 import org.apache.poi.hssf.usermodel.HSSFRow;
9 import org.apache.poi.hssf.usermodel.HSSFSheet;
10 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
11 import org.apache.poi.ss.usermodel.DataFormatter;
12 import org.apache.poi.ss.usermodel.Row;
13
14 public class FromExcelToMap {
15 public static void main(String[] args) {
16 Map map = loadXL("data/Countries.xls", "Countries Worksheet");
17 print(map);
18 }
19
20 /** Returns a Map object containing the data from the specified
21 worksheet in the specified Excel file.
22 */
23 public static Map loadXL(String fileSpec, String sheetName) {
24 Map<String,Integer> map = new TreeMap();
25 try {
26 FileInputStream stream = new FileInputStream(fileSpec);
27 HSSFWorkbook workbook = new HSSFWorkbook(stream);
28 HSSFSheet worksheet = workbook.getSheet(sheetName);
29 DataFormatter formatter = new DataFormatter();
30 for (Row row : worksheet) {
31 HSSFRow hssfRow = (HSSFRow)row;
32 HSSFCell cell = hssfRow.getCell(0);
33 String country = cell.getStringCellValue();
34 cell = hssfRow.getCell(1);
35 String str = formatter.formatCellValue(cell);
36 int population = (int)Integer.getInteger(str);
37 map.put(country, population);
38 }
39 } catch (FileNotFoundException e) {
40 System.err.println(e);
41 } catch (IOException e) {
42 System.err.println(e);
43 }
44 return map;
45 }
46 }
  • 解析JSON文件

 1 import java.io.File;
2 import java.io.FileInputStream;
3 import java.io.FileNotFoundException;
4 import java.io.IOException;
5 import java.io.InputStream;
6 import java.util.ArrayList;
7 import java.util.HashMap;
8 import javax.json.Json;
9 import javax.json.stream.JsonParser;
10 import javax.json.stream.JsonParser.Event;
11
12 public class ParsingJSONFiles {
13 public static void main(String[] args) {
14 File dataFile = new File("data/Books.json");
15 try {
16 InputStream stream = new FileInputStream(dataFile);
17 JsonParser parser = Json.createParser(stream);
18 Event event = parser.next(); // advance past START_OBJECT
19 HashMap<String,Object> map = getMap(parser);
20 System.out.println(map);
21 stream.close();
22 } catch (FileNotFoundException e) {
23 System.out.println(e);
24 } catch (IOException e) {
25 System.out.println(e);
26 }
27 }
28
29 /* Returns the HashMap parsed by the specified parser.
30 Called when event.equals(event.START_OBJECT):
31 */
32 public static HashMap getMap(JsonParser parser) {
33 HashMap<String,Object> map = new HashMap();
34 Event event = parser.next(); // advance past START_OBJECT
35 String key = parser.getString();
36 event = parser.next(); // advance past KEY_NAME
37 while (!event.equals(Event.END_OBJECT)) {
38 if (event.equals(Event.VALUE_STRING)) {
39 String value = parser.getString();
40 map.put(key, value);
41 } else if (event.equals(Event.VALUE_NUMBER)) {
42 Integer value = parser.getInt();
43 map.put(key, value);
44 } else if (event.equals(Event.START_ARRAY)) {
45 ArrayList<String> list = getList(parser);
46 map.put(key, list);
47 }
48 event = parser.next();
49 if (event.equals(Event.END_OBJECT)) {
50 break;
51 }
52 key = parser.getString();
53 event = parser.next();
54 }
55 return map;
56 }
57
58 /* Returns the ArrayList parsed by the specified parser.
59 Called when event.equals(event.START_ARRAY):
60 */
61 public static ArrayList getList(JsonParser parser) {
62 ArrayList list = new ArrayList();
63 Event event = parser.next(); // advance past START_ARRAY
64 while (!event.equals(Event.END_ARRAY)) {
65 if (event.equals(Event.VALUE_STRING)) {
66 list.add(parser.getString());
67 event = parser.next();
68 } else if (event.equals(Event.START_OBJECT)) {
69 HashMap<String,Object> map = getMap(parser);
70 list.add(map);
71 event = parser.next();
72 } else if (event.equals(Event.START_ARRAY)) {
73 ArrayList subList = getList(parser); // recursion
74 list.add(subList);
75 event = parser.next();
76 }
77 }
78 return list;
79 }
80 }

数据处理

  • 生成测试数据集

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.io.PrintWriter;
4 import java.util.Random;
5
/**
 * Generates a small CSV file of uniformly distributed random numbers for use
 * as test data.
 */
public class GeneratingTestData {
    private static final int ROWS = 8, COLS = 5;
    private static final Random RANDOM = new Random();

    /**
     * Writes ROWS lines of COLS comma-separated random values, each with six
     * decimal places, to {@code data/Output.csv}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File outputFile = new File("data/Output.csv");
        // try-with-resources flushes and closes the file on every exit path.
        try (PrintWriter writer = new PrintWriter(outputFile)) {
            for (int i = 0; i < ROWS; i++) {
                for (int j = 0; j < COLS - 1; j++) {
                    // Locale.ROOT pins '.' as the decimal separator. Under a
                    // comma-decimal default locale the values would otherwise
                    // collide with the field separator.
                    writer.printf(java.util.Locale.ROOT, "%.6f,", RANDOM.nextDouble());
                }
                writer.printf(java.util.Locale.ROOT, "%.6f%n", RANDOM.nextDouble());
            }
            // PrintWriter never throws on write failures; ask it explicitly.
            if (writer.checkError()) {
                System.err.println("Error while writing " + outputFile);
            }
        } catch (FileNotFoundException e) {
            System.err.println(e);
        }
    }
}
  • 数据过滤

    • 需求:选择国土面积不小于100万平方公里的内陆国家
    • 过程:数据为dat格式,先定义对应的简单类Country,再写程序将dat中的数据存入Country的Set中,最后做筛选

Country.java

 1 import java.util.HashSet;
2 import java.util.Scanner;
3
/**
 * A country record with a name, population, area and landlocked flag, read
 * from whitespace-separated tokens.
 */
class Country {
    protected String name;
    protected int population;
    protected int area;
    protected boolean landlocked;

    /**
     * Constructs a new Country object from the next four tokens being
     * scanned: name, population, area, landlocked. If no tokens remain, the
     * fields keep their defaults ({@code name} is null, the numbers are 0,
     * {@code landlocked} is false).
     *
     * @param in the scanner to read from
     */
    public Country(Scanner in) {
        // hasNext(), not hasNextLine(): after the last record only the
        // trailing line break remains, which still counts as a "line" but
        // holds no token, so next() would throw NoSuchElementException.
        if (in.hasNext()) {
            this.name = in.next();
            this.population = in.nextInt();
            this.area = in.nextInt();
            this.landlocked = in.nextBoolean();
        }
    }

    /** Returns the four fields as aligned columns: name, population, area, landlocked. */
    @Override
    public String toString() {
        return String.format("%-10s %,12d %,12d %b",
                name, population, area, landlocked);
    }
}

FilteringData.java

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.HashSet;
4 import java.util.Scanner;
5 import java.util.Set;
6 import java.util.TreeMap;
7
8 public class FilteringData {
9 private static final int MIN_AREA = 1000000; // one million
10 public static void main(String[] args) {
11 File file = new File("data/Countries.dat");
12 Set<Country> dataset = readDataset(file);
13
14 for (Country country : dataset) {
15 if (country.landlocked && country.area >= MIN_AREA) {
16 System.out.println(country);
17 }
18 }
19 }
20
21 public static Set readDataset(File file) {
22 Set<Country> set = new HashSet();
23 try {
24 Scanner input = new Scanner(file);
25 input.nextLine(); // read past headers
26 while (input.hasNextLine()) {
27 set.add(new Country(input));
28 }
29 input.close();
30 } catch (FileNotFoundException e) {
31 System.out.println(e);
32 }
33 return set;
34 }
35 }
  • 排序

    • 需求:将Countries.dat中的数据按population进行排序
    • 实现:将数据存入TreeMap(以population为键,TreeMap会按键自动排序)
    • 注意:关键字段必须唯一,即两个国家人口不能相同,否则后读入的记录会覆盖先读入的记录

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.Collections;
4 import java.util.HashMap;
5 import java.util.Scanner;
6 import java.util.Set;
7 import java.util.TreeMap;
8
/**
 * Sorts (name, integer) records by their integer field by loading them into
 * a {@code TreeMap} keyed on that integer. Because the integer is the map
 * key, a record whose integer duplicates an earlier one replaces it.
 */
public class SortingData {
    /**
     * Reads alternating name / integer tokens from the data file and prints
     * the records in ascending order of the integer.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File file = new File("src/main/java/com/hongfeng/SortingData/Countries.dat");
        TreeMap<Integer, String> dataset = new TreeMap<>();
        // try-with-resources closes the scanner even when nextInt() throws.
        try (Scanner input = new Scanner(file)) {
            while (input.hasNext()) {
                String x = input.next();
                int y = input.nextInt();
                dataset.put(y, x);
            }
        } catch (FileNotFoundException e) {
            System.out.println(e);
        }
        print(dataset);
    }

    /**
     * Prints one line per entry in ascending key order: the key as a grouped
     * integer in 12 columns, a space, then the value left-justified in 16.
     *
     * @param map the entries to print
     */
    public static void print(TreeMap<Integer, String> map) {
        // Iterating entries avoids a second tree lookup per key.
        for (java.util.Map.Entry<Integer, String> entry : map.entrySet()) {
            System.out.printf("%,12d %-16s%n", entry.getKey(), entry.getValue());
        }
    }
}
  • 合并

    • 需求:将多个排好序的文件合并为单个排好序的文件
    • Country类实现Comparable接口,定义从文件创建对象的构造方法,以及比较方法
    • 扫描两个文件,比较,存入新文件,一个文件扫描完后,另一个文件逐项扫描即可

Country.java

 1 import java.util.Scanner;
2
/**
 * A country record with a name and a population, ordered by population.
 */
class Country implements Comparable<Country> {
    protected String name;
    protected int population;

    /**
     * Constructs a new Country object from the next two tokens being
     * scanned: name, then population. If no tokens remain, the fields keep
     * their defaults ({@code name} is null, {@code population} is 0) and
     * {@link #isNull()} returns true.
     *
     * @param in the scanner to read from
     */
    public Country(Scanner in) {
        // hasNext(), not hasNextLine(): after the last record only the
        // trailing line break remains, which still counts as a "line" but
        // holds no token, so next() would throw NoSuchElementException.
        if (in.hasNext()) {
            this.name = in.next();
            this.population = in.nextInt();
        }
    }

    /** Returns true if this object was constructed from exhausted input. */
    public boolean isNull() {
        return this.name == null;
    }

    /**
     * Orders countries by ascending population.
     *
     * @param that the country to compare with
     * @return a negative, zero or positive value as this population is less
     *         than, equal to or greater than the other's
     */
    @Override
    public int compareTo(Country that) {
        // Integer.compare cannot overflow, unlike subtracting the two values.
        return Integer.compare(this.population, that.population);
    }

    /** Returns the name and population as aligned columns. */
    @Override
    public String toString() {
        return String.format("%-10s %,12d",
                name, population);
    }
}

MergingFiles

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.io.PrintWriter;
4 import java.util.Scanner;
5
6 public class MergingFiles {
7 public static void main(String[] args) {
8 File inFile1 = new File("data/Countries1.dat");
9 File inFile2 = new File("data/Countries2.dat");
10 File outFile = new File("data/Countries.dat");
11 try {
12 Scanner in1 = new Scanner(inFile1);
13 Scanner in2 = new Scanner(inFile2);
14 PrintWriter out = new PrintWriter(outFile);
15 Country country1 = new Country(in1);
16 Country country2 = new Country(in2);
17 System.out.println(country1.hashCode());
18 System.out.println(country2.hashCode());
19 while (!country1.isNull() && !country2.isNull()) {
20 if (country1.compareTo(country2) < 0) {
21 out.println(country1);
22 country1 = new Country(in1);
23 } else {
24 out.println(country2);
25 country2 = new Country(in2);
26 }
27 }
28 while (!country1.isNull()) {
29 out.println(country1);
30 country1 = new Country(in1);
31 }
32 while (!country2.isNull()) {
33 out.println(country2);
34 country2 = new Country(in2);
35 }
36 in1.close();
37 in2.close();
38 out.close();
39 } catch (FileNotFoundException e) {
40 System.out.println(e);
41 }
42 }
43 }

[Java] 数据分析--数据预处理的更多相关文章

  1. pandas神器操作excel表格大全(数据分析数据预处理)

    使用pandas库操作excel,csv表格操作大全 关注公众号"轻松学编程"了解更多,文末有公众号二维码,可以扫码关注哦. 前言 准备三份csv表格做演示: 成绩表.csv su ...

  2. [Java]数据分析--数据可视化

    时间序列 需求:将一组字符顺序添加到时间序列中 实现:定义时间序列类TimeSeries,包含静态类Entry表示序列类中的各项,以及add,get,iterator,entry方法 TimeSeri ...

  3. 小白学 Python 数据分析(9):Pandas (八)数据预处理(2)

    人生苦短,我用 Python 前文传送门: 小白学 Python 数据分析(1):数据分析基础 小白学 Python 数据分析(2):Pandas (一)概述 小白学 Python 数据分析(3):P ...

  4. 【新人赛】阿里云恶意程序检测 -- 实践记录10.20 - 数据预处理 / 训练数据分析 / TF-IDF模型调参

    Colab连接与数据预处理 Colab连接方法见上一篇博客 数据预处理: import pandas as pd import pickle import numpy as np # 训练数据和测试数 ...

  5. EEGLAB数据分析:预处理与后续处理

    来源:http://blog.sina.com.cn/s/blog_13171a73d0102v4zx.html 数据预处理主要包括数据导入.电极定位.电极返回.滤波.去除伪迹.重建参考.分段.叠加平 ...

  6. Java大数据应用领域及就业方向

    最难毕业季,2017高校毕业生达到795万,许多学生面临着毕业即失业的尴尬.面对着与日俱增的竞争形势和就业压力,很多毕业生选择去知了堂学习社区镀金,以提高自己的就业竞争力,其中Java大数据是学生选择 ...

  7. Java大数据人才应用领域广,就业薪酬高

    互联网创造了大数据应用的规模化环境,大数据应用成功的案例大都是在互联网上发生的, 互联网业务提供了数据,互联网企业开发了处理软件,互联网企业的创新带来了大数据应用 的活跃,没有互联网便没有今天的大数据 ...

  8. 数据准备<3>:数据预处理

    数据预处理是指因为算法或者分析需要,对经过数据质量检查后的数据进行转换.衍生.规约等操作的过程.整个数据预处理工作主要包括五个方面内容:简单函数变换.标准化.衍生虚拟变量.离散化.降维.本文将作展开介 ...

  9. 【sklearn】数据预处理 sklearn.preprocessing

    数据预处理 标准化 (Standardization) 规范化(Normalization) 二值化 分类特征编码 推定缺失数据 生成多项式特征 定制转换器 1. 标准化Standardization ...

随机推荐

  1. Android Studio 之 EditText

    EditText 简介 •简介 EditText是一个非常重要的组件,可以说它是用户和Android应用进行数据传输窗户: 有了它就等于有了一扇和Android应用传输的门,通过它用户可以把数据传给A ...

  2. js 更改json的 key

    let t = data.map(item => { return{ fee: item['费用'], companyName1: item.companyName, remark1: item ...

  3. Spring Cloud Alibaba(2)---Nacos概述

    Spring Cloud Alibaba(2)---nacos概述 上一篇博客讲了有关 SpringCloudAlibaba的概述,这篇开始讲SpringCloudAlibaba组件之一---Naco ...

  4. ON DUPLICATE KEY UPDATE作用

    ON DUPLICATE KEY UPDATE作用 先声明一点,ON DUPLICATE KEY UPDATE为Mysql特有语法,这是个坑 语句的作用,当insert已经存在的记录时,执行Updat ...

  5. 在Visual Studio 中使用git——什么是Git(一)

    写程序必然需要版本控制,哪怕是个人项目也是必须的,微软从Visual Studio 2019开始默认提供了对Git的支持,Visual Studio 2019之前的版本可以安装相应的插件来实现Git功 ...

  6. 观世音甘泉活树的故事竟然是Java设计模式:备忘录模式

    目录 定义 意图 主要解决问题 何时使用 优缺点 结构 白箱实现 黑箱实现 多重检查点 观世音甘泉活树的故事 定义 备忘录模式是对象的行为型模式,备忘录对象是一个用来存储另外一个对象内部状态的快照的对 ...

  7. JDBC_04_使用Properties集合保存JDBC所需配置信息

    使用Properties集合保存JDBC所需配置信息 将JDBC连接所需的配置信息保存在一个配置文件中,然后使用Properties将该信息存储起来,动态的完成JDBC的配置连接 代码: import ...

  8. linux gcc命令参数

    gcc命令参数笔记 1. gcc -E source_file.c -E,只执行到预处理.直接输出预处理结果. 2. gcc -S source_file.c -S,只执行到汇编,输出汇编代码. 3. ...

  9. Python表达式进阶——列表表达式

    x = 0 y = x*2 if x >= 0 else x print(y) # [表达式for变量in列表] l1 = [] l2 = [i for i in range(100) if i ...

  10. 展开说说,Spring Bean IOC、AOP 循环依赖

    作者:小傅哥 博客:https://bugstack.cn 沉淀.分享.成长,让自己和他人都能有所收获! 一.前言 延迟满足能给你带来什么? 大学有四年时间,但几乎所有人都是临近毕业才发现找一份好工作 ...