使用hash拆分文件

package readImgUrl;

import java.io.BufferedInputStream;

import java.io.BufferedReader;

import java.io.BufferedWriter;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileOutputStream;

import java.io.FileReader;

import java.io.FileWriter;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.OutputStreamWriter;

import java.net.URL;

import java.util.ArrayList;

import java.util.Arrays;

import java.util.Collections;

import java.util.Comparator;

import java.util.List;

/**
 * Helpers for processing a large, tab-separated URL list (one record per line, with the
 * URL in the second column): hashing the URL host into one of {@code HASHLEN} buckets,
 * reading/writing line lists, sorting a bucket file and sniffing a file's encoding.
 *
 * <p>All methods are static and best-effort: I/O failures are reported on standard error
 * through {@code printStackTrace()} instead of being propagated, unless stated otherwise.
 */
public class ClassifyUrl {

    /** Number of hash buckets; {@link #hash(char[])} returns a value in {@code [0, HASHLEN)}. */
    private static final int HASHLEN = 100;

    /** Directory holding the per-bucket files, named {@code <bucket>.txt}. */
    private static final String file_dir = "D:\\学习\\实验室项目\\ImageNet图片爬取\\classify_url\\";

    /** The URL list that {@link #main(String[])} processes. */
    private static final String src_file = "D:\\学习\\实验室项目\\ImageNet图片爬取\\fall11_urls.txt";

    /**
     * Entry point: scans the configured source file with {@link #classify_url(String)}.
     *
     * @param args ignored
     * @throws Exception declared for backward compatibility; the helpers report their own errors
     */
    public static void main(String[] args) throws Exception {
        classify_url(src_file);
    }

    /**
     * Sorts the lines of the bucket file {@code <filename>.txt} by their second
     * tab-separated column and writes the result to {@code <filename>_.txt}; both files
     * live in the bucket directory. Prints the number of lines read to standard output.
     *
     * <p>Lines without a second column sort as if that column were empty. If the input
     * file cannot be read nothing is written.
     *
     * @param filename bucket file name without directory and without the {@code .txt} suffix
     */
    public static void rank_filedata(String filename) {
        String path1 = file_dir + filename + ".txt";
        String path2 = file_dir + filename + "_" + ".txt";

        List<String> list = reader_list(path1);
        if (list == null) {
            // reader_list has already reported the failure; there is nothing to sort.
            return;
        }
        System.out.println(list.size());

        Collections.sort(list, new Comparator<String>() {
            @Override
            public int compare(String s1, String s2) {
                return secondColumn(s1).compareTo(secondColumn(s2));
            }
        });

        writer_list(list, path2);
    }

    /**
     * Returns the text between the first and the second tab of {@code line}.
     *
     * <p>For lines that do have a second column this equals {@code line.split("\t")[1]};
     * for lines with no tab, or with nothing but tabs after the first column, it returns
     * an empty string instead of throwing.
     */
    private static String secondColumn(String line) {
        int first = line.indexOf('\t');
        if (first < 0) {
            return "";
        }
        int second = line.indexOf('\t', first + 1);
        if (second < 0) {
            return line.substring(first + 1);
        }
        return line.substring(first + 1, second);
    }

    /**
     * Reads every line of a text file, decoding it with the platform default charset
     * (that is what {@link FileReader} uses).
     *
     * @param path file to read
     * @return the lines in file order, or {@code null} if the file could not be read
     *         (the failure is printed to standard error)
     */
    public static List<String> reader_list(String path) {
        List<String> lineList = new ArrayList<String>();
        // try-with-resources closes the reader even when readLine() fails half-way.
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lineList.add(line);
            }
            return lineList;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Writes every element of {@code list} to {@code path}, one per line, terminated by
     * {@code "\r\n"}, replacing any existing content. The platform default charset is used
     * (that is what {@link FileWriter} uses).
     *
     * @param list lines to write; every element is expected to be a {@code String}
     * @param path destination file
     */
    public static void writer_list(List<?> list, String path) {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(path))) {
            for (Object item : list) {
                // The cast keeps the historical contract: non-String elements are an error,
                // which is reported by the catch block below.
                String line = (String) item;
                writer.write(line + "\r\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Scans a tab-separated URL list line by line and computes, for every line, the hash
     * bucket of the host of the URL found in the second column.
     *
     * <p>In this revision the bucket is only computed: the append to the bucket file via
     * {@link #writer(String, String)} is switched off, so the method acts as a validation
     * and sampling pass. Lines whose second column is not a parseable URL are reported on
     * standard error and skipped. Every 100th line read is echoed to standard output when
     * {@link #isCnorEn(char)} accepts at least one of its characters.
     *
     * @param path the URL list; its encoding is detected with {@link #judgeFileCode(String)}
     */
    public static void classify_url(String path) {
        String filecode = judgeFileCode(path);
        if (filecode == null) {
            // judgeFileCode has already printed the cause.
            System.err.println("Cannot read " + path + "; nothing was classified");
            return;
        }

        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), filecode))) {
            String line = reader.readLine();
            int line_num = 0;

            while (line != null) {
                try {
                    String host = new URL(secondColumn(line)).getHost();
                    // Bucket this line belongs to. To actually split the file, append the
                    // line with writer(String.valueOf(bucket), line) at this point.
                    int bucket = hash(host.toCharArray());
                } catch (Exception e) {
                    // A malformed record must not abort the whole scan.
                    e.printStackTrace();
                }

                line = reader.readLine();
                line_num++;

                // readLine() returns null at end of input, so the sample check must not
                // touch the line in that case (this used to fail whenever the number of
                // lines was a multiple of 100).
                if (line != null && line_num % 100 == 0) {
                    for (char c : line.toCharArray()) {
                        if (isCnorEn(c)) {
                            System.out.println(line);
                            break;
                        }
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Coarse character-range check.
     *
     * @param c character to test
     * @return {@code true} if {@code c} lies in {@code U+0000..U+00FF} (treated here as
     *         "English") or in {@code U+0391..U+FFE5} (treated here as "Chinese")
     */
    static boolean isCnorEn(char c) {
        // A char is never negative, so the lower bound of the first range needs no test.
        return c <= 0x00FF || (c >= 0x0391 && c <= 0xFFE5);
    }

    /**
     * Hashes a character sequence into a bucket index.
     *
     * <p>The accumulation step is {@code index += index * 31 + c}, i.e.
     * {@code index = index * 32 + c} with normal {@code int} wrap-around. It is kept
     * exactly as is so that bucket assignments stay stable across runs and versions.
     *
     * @param word characters to hash; must not be {@code null}
     * @return a value in {@code [0, HASHLEN)}
     */
    public static int hash(char[] word) {
        int index = 0;
        for (char c : word) {
            index += index * 31 + c;
        }
        // |index % HASHLEN| is below HASHLEN, so Math.abs cannot overflow here.
        return Math.abs(index % HASHLEN);
    }

    /**
     * Appends {@code line} followed by {@code "\r\n"} to the bucket file
     * {@code <filename>.txt} in the bucket directory, encoded as GBK. The file is created
     * if it does not exist yet; a {@code null} line only creates the file.
     *
     * @param filename bucket file name without directory and without the {@code .txt} suffix
     * @param line     text to append, may be {@code null}
     */
    public static void writer(String filename, String line) {
        String path = file_dir + filename + ".txt";
        // Opening in append mode creates a missing file, so no separate createNewFile()
        // step is needed.
        try (OutputStreamWriter writer =
                new OutputStreamWriter(new FileOutputStream(path, true), "GBK")) {
            if (line != null) {
                writer.write(line + "\r\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Guesses a file's encoding from its first three bytes.
     *
     * @param path file to inspect
     * @return {@code "UTF-8"} if the file starts with the UTF-8 byte order mark
     *         {@code EF BB BF}, {@code "GBK"} otherwise (including files shorter than
     *         three bytes), or {@code null} if the file could not be read (the failure
     *         is printed to standard error)
     */
    public static String judgeFileCode(String path) {
        byte[] head = new byte[3];
        try (InputStream in = new FileInputStream(path)) {
            // A single read() may legally return fewer bytes than requested.
            int filled = 0;
            while (filled < head.length) {
                int n = in.read(head, filled, head.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }

        boolean utf8Bom = head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF;
        return utf8Bom ? "UTF-8" : "GBK";
    }

    /**
     * Guesses a file's encoding from its first two bytes.
     *
     * <p>The returned labels are kept as they have always been; note that
     * {@code "ANSI|ASCII"} is a descriptive label, not a name accepted by
     * {@code java.nio.charset.Charset}.
     *
     * @param fileName file to inspect
     * @return {@code "UTF-8"} for {@code EF BB}, {@code "Unicode"} for {@code FF FE},
     *         {@code "UTF-16BE"} for {@code FE FF}, {@code "ANSI|ASCII"} for {@code 5C 75},
     *         and {@code "GBK"} for anything else (including files shorter than two bytes)
     * @throws Exception if the file cannot be opened or read
     */
    public static String codeString(String fileName) throws Exception {
        int p;
        try (BufferedInputStream bin = new BufferedInputStream(new FileInputStream(fileName))) {
            // Big-endian combination of the first two bytes; read() yields -1 at end of
            // input, which produces a value that matches none of the cases below.
            p = (bin.read() << 8) + bin.read();
        }

        switch (p) {
            case 0xefbb:
                return "UTF-8";
            case 0xfffe:
                return "Unicode";
            case 0xfeff:
                return "UTF-16BE";
            case 0x5c75:
                return "ANSI|ASCII";
            default:
                return "GBK";
        }
    }

}

使用hash拆分文件的更多相关文章

Linux split拆分文件
介绍 split可以将一个大文件拆分成指定大小的多个文件,并且拆分速度非常的快,拆分一 ...
linux_shell_拆分文件_多进程脚本
[需求场景]:一个10000w行的文件处理 ,多进程处理比如启动100个进程同时处理. [方法]:拆分文件(split) ,制作shell脚本执行后台进程 [demo]: 假设处理程序为 ...
linux 拆分文件
split [OPTION]... [INPUT [PREFIX]] :根据行或者大小拆分文件 split file_name :默认把文件file_name拆分成xaa,xab,xac,...... ...
linux拆分文件
1.先看下文件总的行数: wc -l filename 我们现在来看看它具体的参数该怎么用: split支持自定义输出文件大小和输出文件行数两种模式,此外还可以定义每一行最大的值. -l 按输出文件行 ...
split - 拆分文件
拆分文件 # 每个文件的行数为1000行 split -l 1000 test.txt # 将test文件拆分,20M一个文件 split -b 20M test.txt test文件拆分,并且文件名 ...
casperjs在拆分文件后的中文乱码问题的解决
windows环境. capserjs的中文乱码使用phantom.outputEncoding="GBK";即可解决. 但当我们脚本很大,需要拆分时(参考http://docs. ...
java 按内容拆分文件
文件内容为: BC************* **************** *************** BC************* **************** *********** ...
python_基础学习_02_拆分文件（split）
做爬虫经常会有这样的引用场景 ,原始网页存储格式为 url+\t+ html php 有个explode的拆分文本行方法,比较方便直接接收列值 list($url,$html)=explode(& ...
RandomAccessFile拆分合并文件
import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java. ...

随机推荐

OpenSSL(2)创建私有证书颁发机构
如果想要建立自己的CA, OpenSSL已经包含了所有你需要的东西.所有的操作都通过纯命令行执行,虽然不那么友好,整个过程也比较长,但是这可以让你去思考每个细节. 我建议自己创建一套私有的 CA主要是 ...
wampserver2.2 在window2003下的安装的主要问题
准备安装最新的wampserver 2.2c, 1.安装问题,安装完成后总是无法启动服务系统事件中提示错误找不到附属汇编 Microsoft.VC90.CRT,上一个错误是参照的汇编没有 ...
PXE自动化部署
PXE 预启动执行环境,基于tftp条件下完成基于网络的自动化部署软件原理: 网卡利用自身的tftp 请求dhcp 服务器获取ip和一个pxelinux.0的地址在给定的tftp目录下存有ks的配 ...
权限和ACL访问控制-02-特殊权限
X(大写) X:给目录x权限,不给文件x权限(当文件本来就有x权限的话会重新赋予x权限) 例如: chmod -R +X dir2 SUID SUID属性一般运用在可执行文件上,当用户执行该执行文件时 ...
使用nfs制作动态分配存储卷
参考文献:https://yq.aliyun.com/articles/613036 相对于静态存储, 动态存储的优势: ● 管理员无需预先创建大量的PV作为存储资源; ● 静态存储需要用户申请PVC ...
nginx跨域设置&文件上传大小限制
在部署项目的时候碰到这么一个问题:XMLHttpRequest cannot load,下面阐述一下这个问题问题背景: 用nginx+tomcat部署项目.tomcat用的8080端口,nginx用 ...
Rest_Framework简介
Web应用模式在开发Web应用中,有两种应用模式:前后端不分离和前后端分离前后端不分离前后端不分离通俗来讲就是不区分前端和后端,浏览器请求时服务器直接返回页面,其示意图如下前后端分离前后端分 ...
springmvc4.3.7中使用RequestBody，传入json参数时，得到错误415 Unsupported Media Type
在新建一个maven的项目的时候,当时并非springboot项目,是通过xml来配置的项目.在项目中DispatcherServlet的配置文件中配置了annotation-driven的, < ...
Python之基于十六进制判断文件类型
核心代码: #!/usr/bin/env python # -*- coding: utf-8 -*- # @Author : suk import struct from io import Byt ...
three months timestamp
1.有效期三个月 package com.hengqin.life.idps; import java.text.SimpleDateFormat; import java.util.Calendar ...

使用hash拆分文件

使用hash拆分文件的更多相关文章

随机推荐

热门专题