Requirement:

/**
* before:
* file A1.csv {1,2,3,4,5}
* file A2.csv {2,3,9,10,11}
* file B1.csv {5,12,13,14,15}
* file B2.csv {16,14,15,4,9,20,30}
* A1.csv A2.csv B1.csv B2.csv must not contain repeated IDs across files
*
* after:
* file A1.csv {1,4}
* file A2.csv {2,3,10,11}
* file B1.csv {12,13}
* file B2.csv {16,9,20,30}
*/

  

tangxin@tangxin:~/csvrepeat$ ls
A1.csv A2.csv B1.csv B2.csv

  

CSVUtilVersion2.java

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.springframework.util.StringUtils; import java.io.*;
import java.lang.reflect.Array;
import java.util.*; /**
* before:
* file A1.csv {1,2,3,4,5}
* file A2.csv {2,3,9,10,11}
* file B1.csv {5,12,13,14,15}
* file B2.csv {16,14,15,4,9,20,30}
* A1.csv A2.csv B1.csv B2.csv must not contain repeated IDs across files
*
* after:
* file A1.csv {1,4}
* file A2.csv {2,3,10,11}
* file B1.csv {12,13}
* file B2.csv {16,9,20,30}
*/
@Slf4j
public class CSVUtilVersion2 { private static final String CSV_PATH = "/home/tangxin/csvrepeat/";
private static final boolean CREATE_SWITCH = true; /**
* read single column data list
* @param path
* @return
*/
public static List<String> ids(String path) {
List<String> result = new ArrayList<>();
File csv = new File(path); // CSV文件路径
LineIterator it = null;
try {
it = FileUtils.lineIterator(csv);
while (it.hasNext()) {
String line = it.nextLine();
if (line.trim().contains("ID")) {
continue;
}
String[] arr = line.split(",");
String ID = arr[0];
ID = ID.replaceAll("\"", "").trim();
if (!StringUtils.isEmpty(ID)) {
result.add(ID);
}
}
} catch (Exception e) {
log.error("读取ID csv文件失败:{}", e.getMessage());
} finally {
LineIterator.closeQuietly(it);
}
return result;
} /**
* from src delete oth
* @param src
* @param oth
* @return
*/
public static List removeAll(List src, List oth) {
LinkedList result = new LinkedList(src);
HashSet othHash = new HashSet(oth);
Iterator iter = result.iterator();
while (iter.hasNext()) {
if (othHash.contains(iter.next())) {
iter.remove();
}
}
return result;
} /**
* -Xms1g -Xmx1g -XX:PermSize=128m -XX:SurvivorRatio=2 -XX:+UseParallelGC
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception { //∑=1+2+3+...+(n-1) group LinkedList<String> fileList = new LinkedList<>();
fileList.add("A1.csv");
fileList.add("A2.csv");
fileList.add("B1.csv");
fileList.add("B2.csv");
// fileList.add("C1.csv"); DescartesRepeat(fileList); ded(fileList); } private static void DescartesRepeat(LinkedList<String> fileList) {
Set<String> repeatList = new HashSet<>(); Set<String> groupSet = new HashSet<>(); Set<String> goONList = new HashSet<>(); //A1->A2,B1,B2
for (int i = 0; i < fileList.size(); i++) { String itemI = fileList.get(i); for (int j = 0; j < fileList.size(); j++) { String itemJ = fileList.get(j); if (!itemI.equals(itemJ)) { String groupR1 = itemI + "->" + itemJ;
String groupR2 = itemJ + "->" + itemI; if (groupSet.contains(groupR1) || groupSet.contains(groupR2)){
continue;
} groupSet.add(groupR1); String repeatT = repeat(CSV_PATH + itemI, CSV_PATH + itemJ);
if(!StringUtils.isEmpty(repeatT)){
repeatList.add(repeatT);
//System.out.println(groupR1+"->"+repeatT);
} } }
} if (CollectionUtils.isNotEmpty(repeatList)) {
// System.out.println(repeatList);
for (String repeatItem : repeatList) {
Iterator<String> iterator = fileList.iterator();
while (iterator.hasNext()) {
String oldItem = iterator.next(); String oldS = oldItem.replace(".csv", "").replace("-new","");
String repeatS = repeatItem.replace(".csv","").replace("-new","");
if (repeatS.contains(oldS)) {
iterator.remove();
goONList.add(repeatItem);
}
}
}
fileList.addAll(goONList);
System.out.println(fileList);
DescartesRepeat(fileList);
}
} public static void ded(List<String> args) { //保证指定csv列表每组都不能有重复数据
for (int i = 0; i < args.size(); i++) {
// if(i>0){
// continue;
// } String source = CSV_PATH + args.get(i); for (int j = 0; j < args.size(); j++) { if (i == j) {
continue;
} String target = CSV_PATH + args.get(j);
intersection(source, target);
} } } public static void intersection(String sourcePath, String targetPath) {
List<String> ids1 = ids(sourcePath);
List<String> ids2 = ids(targetPath);
List<String> inter = (List<String>) CollectionUtils.intersection(ids1, ids2);
System.out.println(sourcePath + "和" + targetPath + "的重复数据大小" + inter.size());
} public static String repeat(String source, String target){
//cdd fund xyd List<String> ids1 = ids(source);
List<String> ids2 = ids(target); // System.out.println(source + "集合大小" + ids1.size());
// System.out.println(target + "集合大小" + ids2.size()); List<String> inter = (List<String>) CollectionUtils.intersection(ids1, ids2); // System.out.println("去重数据大小:" + inter.size()); if (inter != null && inter.size() > 0) { if (ids1.size() > ids2.size()) {
return repeatInner(source, ids1, inter);
} else if (ids2.size() > ids1.size()) {
return repeatInner(target, ids2, inter);
} else {
return repeatInner(source, ids1, inter);
} } return "";
} private static String repeatInner(String source, List<String> ids, List<String> inter) {
String newPath = source.replace(".csv", "-new.csv");
List<String> ids1new = removeAll(ids, inter);
createCSV(ids1new, newPath);
return newPath.replace(CSV_PATH,"");
} /**
* 创建CSV文件
*/
public static void createCSV(List<String> list, String fileName) { if(!CREATE_SWITCH){
// System.out.println("创建csv开关关闭");
return;
}else{
// System.out.println("创建csv开关开启");
} // 表格头
Object[] head = {"ID"};
List<Object> headList = Arrays.asList(head); //数据
List<List<Object>> dataList = new ArrayList<>();
List<Object> rowList = null;
for (int i = 0; i < list.size(); i++) {
rowList = new ArrayList<>();
rowList.add(list.get(i));
dataList.add(rowList);
} File csvFile;
BufferedWriter csvWtriter = null;
try {
csvFile = new File(fileName);
File parent = csvFile.getParentFile();
if (parent != null && !parent.exists()) {
parent.mkdirs();
}
csvFile.createNewFile(); // GB2312使正确读取分隔符","
csvWtriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csvFile), "GB2312"), 1024); // 写入文件头部
writeRow(headList, csvWtriter); // 写入文件内容
for (List<Object> row : dataList) {
writeRow(row, csvWtriter);
}
csvWtriter.flush();
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
csvWtriter.close();
} catch (IOException e) {
e.printStackTrace();
}
}
} /**
* 写一行数据
*
* @param row 数据列表
* @param csvWriter
* @throws IOException
*/
private static void writeRow(List<Object> row, BufferedWriter csvWriter) throws IOException {
for (Object data : row) {
StringBuffer sb = new StringBuffer();
String rowStr = sb.append("\"").append(data).append("\",").toString();
csvWriter.write(rowStr);
}
csvWriter.newLine();
} }

  

tangxin@tangxin:~/csvrepeat$ ls
A1.csv A1-new.csv A1-new-new.csv A2.csv A2-new.csv B1.csv B2.csv B2-new.csv B2-new-new.csv

  

java csv list cant not repeat的更多相关文章

  1. Java CSV操作(导出和导入)

    Java CSV操作(导出和导入)  CSV是逗号分隔文件(Comma Separated Values)的首字母英文缩写,是一种用来存储数据的纯文本格式,通常用于电子表格或数据库软件.在 CSV文件 ...

  2. java csv - 读写及其操作.

    今天帮同学处理数据, 主要是从1w多条记录中随机获取8k条, 然后再从8k条记录中随机获取2k条记录. 最后将2k条记录中随机分成10组,使得每组的记录都不重复. 下面将我的代码都贴上来, 好以后处理 ...

  3. java csv 文件 操作类

    一个CSV文件操作类,功能比较齐全: package tool; import java.io.BufferedReader; import java.io.BufferedWriter; impor ...

  4. 【Java】Java CSV操作代码

    CSV是逗号分隔文件(Comma Separated Values)的首字母英文缩写,是一种用来存储数据的纯文本格式,通常用于电子表格或数据库软件.在 CSV文件中,数据“栏”以逗号分隔,可允许程序通 ...

  5. 【Java/Csv/Regex】用正则表达式去劈分带引号的csv文件行,得到想要的行数据

    csv文件是用引号分隔的文本行,为了完善内容人们又用引号把每个区块的内容又包了起来,于是形成下面的文件: "1","2","3"," ...

  6. 【Java/csv】一个CSV文件解析类(转载)

    /*下文写得不错,值得学习**/ import java.io.BufferedReader; import java.io.FileReader; import java.util.ArrayLis ...

  7. Java csv

    CsvWriter csvWriter = new CsvWriter("data2019052803.csv", ',', Charset.forName("UTF-8 ...

  8. java csv文件写入

    List<String> list_code = null; 方案1 控制字符集: BufferedWriter bw=new BufferedWriter(new OutputStrea ...

  9. 统一的Json组件和csv下载组件

    java-web-common java-web-common Json组件 目标和用途 规范Json接口格式 Controller中一律返回Java object,组件将自动转换数据格式,满足Jso ...

随机推荐

  1. UML绘制活动图--客户来电咨询活动图

    选择Logic View–>New–>Activity Diagram 修改NewActivity为客户来电咨询 选择初始状态和终止状态(下图中上面是Start State,下面是End ...

  2. Linux 服务器之间文件传输

    linux的scp命令: scp就是secure copy的简写,用于在linux下进行远程拷贝文件的命令,和它类似的命令有cp,不过cp只是在本机进行拷贝不能跨服务器. 有时我们需要获得远程服务器上 ...

  3. 快速玩转linux(1)

    快速上手Linux玩转典型应用 mark 大牛都会使用Linux, Linux命令是行业要求. 商业服务器基本都是linux 开源软件都先支持Linux(只支持) 大数据分析.机器学习首选Linux ...

  4. jQuery(三)HTML

    获得内容: text() - 设置或返回所选元素的文本内容 html() - 设置或返回所选元素的内容(包括 HTML 标记) val() - 设置或返回表单字段的值 <html> < ...

  5. ESP32 LyraT音频开发板试玩(二):播放音乐

    我是卓波,很高兴你来看我的博客. 系列文章: ESP32 LyraT音频开发板试玩(一):搭建开发环境 ESP32 LyraT音频开发板试玩(二):播放音乐 本文延续上一篇博客 将D:\msys32\ ...

  6. Python自动化运维——文件内容差异对比

    Infi-chu: http://www.cnblogs.com/Infi-chu/ 模块:difflib 安装:Python版本大于等于2.3系统自带 功能:对比文本之间的差异,而且支持输出可读性比 ...

  7. ionic打包apkFailed to execute shell command "input,keyevent,82"" on device: Error: adb: Command failed with exit code 137

    错误代码如下 BUILD SUCCESSFUL in 12s 46 actionable tasks: 1 executed, 45 up-to-date Built the following ap ...

  8. consul 使用方式

    1.在配置文件配置好的情况下,在运行 consul agent -server -datacenter=([xacl.json].[acl_datacenter]) -bootstrap -data- ...

  9. 形象的理解Strong和Weak

    Strong Weak

  10. PostgreSQL字段名和表名大小写的问题

    创建表的时候,表名和字段名必须全小写,然后查询的时候不管全大写或全小写,或是Camel模式都不会报错.只要名称中有大写字母,或者全大写,查询时就必须保证大小写正确并用双引号包起来,否则就会报“XXX不 ...