Training

Entry point

package org.wordCount;

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WordMain {

    // private static List<String> secondDir = new ArrayList<String>();

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // The next two lines are important: ship the local jar and point at the JobTracker.
        conf.set("mapred.jar", "E://eclipse//jar-work//WordMain.jar");
        conf.set("mapred.job.tracker", "192.168.190.128:9001");

        // Where the prior probability of each class is saved
        String priorProbality = "hdfs://192.168.190.128:9000/user/hadoop/output/priorP/priorProbability.txt";
        conf.set("priorProbality", priorProbality);

        // Where the total number of distinct words (vocabulary size) is saved
        String totalWordsPath = "hdfs://192.168.190.128:9000/user/hadoop/output/totalwords.txt";
        conf.set("totalWordsPath", totalWordsPath);

        // Total number of words in each class
        String wordsInClassPath = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsFrequence/_wordsInClass/wordsInClass-r-00000";
        conf.set("wordsInClassPath", wordsInClassPath);

        // Corpus input path and output path of the word-frequency job
        // "/user/hadoop/input/NBCorpus/Country"
        String input = "hdfs://192.168.190.128:9000/user/hadoop/input/NBCorpus/Country";
        String wordsOutput = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsFrequence";
        conf.set("input", input);
        conf.set("wordsOutput", wordsOutput);

        // Where the per-class word probabilities are saved;
        // the input of that step is the word-frequency output above.
        String freqOutput = "hdfs://192.168.190.128:9000/user/hadoop/output/probability/";
        conf.set("freqOutput", freqOutput);

        FileCount.run(conf);
        WordCount.run(conf);
        Probability.run(conf);

        /*
        System.out.print("----------");
        String[] otherArgs = new String[] { "hdfs://192.168.190.128:9000/user/hadoop/test/",
                "hdfs://192.168.190.128:9000/user/hadoop/wordcount/output2/" };
        conf.set("mapred.jar", "E://eclipse//jar-work//WordMain.jar");
        Job job = new Job(conf, "word count");
        job.setJarByClass(WordMain.class);
        job.setInputFormatClass(MyInputFormat.class);
        job.setMapperClass(WordMapper.class);
        // job.setCombinerClass(WordReducer.class);
        job.setReducerClass(WordReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // MyUtils.addInputPath(job, new Path(otherArgs[0]), conf);
        List<Path> inputPaths = getSecondDir(conf, otherArgs[0]);
        for (Path path : inputPaths) {
            System.out.println("path = " + path.toString());
            MyInputFormat.addInputPath(job, path);
        }
        System.out.println("addinputpath ok");
        // FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
        */
    }

    // Returns the second-level directories under a folder.
    static List<Path> getSecondDir(Configuration conf, String folder) throws Exception {
        Path path = new Path(folder);
        FileSystem fs = path.getFileSystem(conf);
        FileStatus[] stats = fs.listStatus(path);
        List<Path> folderPath = new ArrayList<Path>();
        for (FileStatus stat : stats) {
            if (stat.isDir()) {
                // Only keep categories that contain more than 10 files.
                if (fs.listStatus(stat.getPath()).length > 10) {
                    folderPath.add(stat.getPath());
                }
            }
        }
        return folderPath;
    }
}

Counting the number of documents per category

package org.wordCount;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

/**
 * Counts the documents in each category and computes the prior probabilities.
 * The priors are saved to /user/hadoop/output/priorP/prior.txt.
 */
public class FileCount {

    public static void run(Configuration conf) throws Exception {
        int sum = 0;
        String in = conf.get("input");
        Map<String, Integer> map = new HashMap<>();
        Map<String, Double> priorMap = new HashMap<>();

        // Count the documents per category (categories with too few documents are filtered out below).
        map = FileCount.getFileNumber(in);

        // Print each category and its document count, accumulating the total.
        Iterator<Map.Entry<String, Integer>> itrs = map.entrySet().iterator();
        while (itrs.hasNext()) {
            Map.Entry<String, Integer> it = itrs.next();
            if (it.getValue() <= 10) {
                // Skip categories with 10 documents or fewer.
                itrs.remove();
            } else {
                sum += it.getValue();
                System.out.println(it.getKey() + "\t" + it.getValue());
            }
        }
        System.out.println("sum = " + sum);

        String output = conf.get("priorProbality");
        Path outputPath = new Path(output);
        FileSystem fs = outputPath.getFileSystem(conf);
        FSDataOutputStream outputStream = fs.create(outputPath);

        // The prior of a class is the fraction of all documents that belong to it.
        String ctx = "";
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            Double result = 0.0;
            result = Double.parseDouble(entry.getValue().toString()) / sum;
            priorMap.put(entry.getKey(), result); // also keep it in priorMap
            ctx += entry.getKey() + "\t" + result + "\n";
        }
        outputStream.writeBytes(ctx);
        IOUtils.closeStream(outputStream);

        // Print the priors (another way of iterating over a map); they have already been written to the file.
        Iterator<Map.Entry<String, Double>> iterators = priorMap.entrySet().iterator();
        while (iterators.hasNext()) {
            Map.Entry<String, Double> iterator = iterators.next();
            System.out.println(iterator.getKey() + "\t" + iterator.getValue());
        }
    }

    // Returns a map from category name to the number of files in that category's folder.
    public static Map<String, Integer> getFileNumber(String folderPath) throws Exception {
        Map<String, Integer> fileMap = new HashMap<>();
        Configuration conf = new Configuration();
        Path path = new Path(folderPath);
        FileSystem hdfs = path.getFileSystem(conf);
        FileStatus[] status = hdfs.listStatus(path);
        for (FileStatus stat : status) {
            if (stat.isDir()) {
                int length = hdfs.listStatus(stat.getPath()).length;
                String name = stat.getPath().getName();
                fileMap.put(name, length);
            }
        }
        return fileMap;
    }
}
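To make the prior concrete: FileCount keeps only categories with more than 10 documents, sums their document counts into sum, and writes one line per class to priorProbability.txt. So if, purely for illustration, class ALB had 120 documents out of a filtered total of 1000, the line written would be "ALB	0.12", i.e. P(c) = N_c / N.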

Counting the words in each document

package org.wordCount;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class WordCount {

    private static MultipleOutputs<Text, IntWritable> mos;

    // TOTALWORDS counts the number of distinct words (the vocabulary size).
    static enum WordsNature {
        CLSASS_NUMBER, CLASS_WORDS, TOTALWORDS
    }

    // Mapper: the key is the category name (from MyInputFormat) and the value is the document text.
    static class First_Mapper extends Mapper<Text, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private final static IntWritable zero = new IntWritable(0);
        private Text countryName = new Text();

        @Override
        protected void map(Text key, Text value, Mapper<Text, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                String word = itr.nextToken();
                if (!(MyUtils.hasDigit(word) || word.contains("."))) { // skip meaningless tokens
                    countryName.set(key.toString() + "\t" + word);
                    context.write(countryName, one);     // word frequency within a class, e.g. "ABL have 1"
                    context.write(key, one);             // total number of words in the class
                    context.write(new Text(word), zero); // used to count the number of distinct words
                }
            }
        }
    }

    // Reducer
    static class First_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // result holds the count of one word within one class.
        IntWritable result = new IntWritable();
        Map<String, List<String>> classMap = new HashMap<String, List<String>>();
        Map<String, List<String>> fileMap = new HashMap<String, List<String>>();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            // A sum of 0 means the key is a bare word (only zeros were emitted for it),
            // so it adds 1 to the vocabulary size.
            if (sum == 0) {
                context.getCounter(WordsNature.TOTALWORDS).increment(1);
            } else {
                // Otherwise the shape of the key tells us what the record is.
                String[] temp = key.toString().split("\t");
                if (temp.length == 2) {
                    // "class \t word": the word frequency within a class.
                    result.set(sum);
                    context.write(key, result);
                    // mos.write(new Text(temp[1]), result, temp[0]);
                } else {
                    // A bare class name: the total number of words in that class.
                    result.set(sum);
                    mos.write(key, result, "_wordsInClass" + "\\" + "wordsInClass");
                }
            }
        }

        @Override
        protected void setup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            mos = new MultipleOutputs<Text, IntWritable>(context);
        }

        @Override
        protected void cleanup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            mos.close();
        }
    }

    public static int run(Configuration conf) throws Exception {
        Job job = new Job(conf, "file count");
        job.setJarByClass(WordCount.class);
        job.setInputFormatClass(MyInputFormat.class);
        job.setMapperClass(WordCount.First_Mapper.class);
        job.setReducerClass(WordCount.First_Reducer.class);

        // Only use categories that contain more than 10 documents as input paths.
        String input = conf.get("input");
        List<Path> inputPaths = MyUtils.getSecondDir(conf, input);
        for (Path path : inputPaths) {
            System.out.println("path = " + path.toString());
            MyInputFormat.addInputPath(job, path);
        }

        String wordsOutput = conf.get("wordsOutput");
        FileOutputFormat.setOutputPath(job, new Path(wordsOutput));
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        int exitCode = job.waitForCompletion(true) ? 0 : 1;

        // Read the counter that holds the number of distinct words.
        Counters counters = job.getCounters();
        Counter c1 = counters.findCounter(WordsNature.TOTALWORDS);
        System.out.println("-------------->>>>: " + c1.getDisplayName() + ":" + c1.getName() + ": " + c1.getValue());

        // Write the vocabulary size to a file.
        Path totalWordsPath = new Path("hdfs://192.168.190.128:9000/user/hadoop/output/totalwords.txt");
        FileSystem fs = totalWordsPath.getFileSystem(conf);
        FSDataOutputStream outputStream = fs.create(totalWordsPath);
        outputStream.writeBytes(c1.getDisplayName() + ":" + c1.getValue());
        IOUtils.closeStream(outputStream);

        // A possible improvement: write the vocabulary size into the Configuration
        // instead of a file before the probability step.
        // conf.set("TOTALWORDS", totalWords.toString());

        return exitCode;
    }
}
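The trick in First_Mapper is that it emits three different kinds of keys for the same token, and First_Reducer tells them apart by the summed value and by the tab in the key. For a document of class ALB containing the word weekend, one map() call emits (values shown for illustration):

("ALB\tweekend", 1)   word frequency within a class; the reducer sums these and writes them to the main output
("ALB", 1)            words in the class; the reducer routes the sum to _wordsInClass/wordsInClass via MultipleOutputs
("weekend", 0)        a bare word with value 0; a zero sum makes the reducer increment the TOTALWORDS counter, i.e. the vocabulary size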

MyInputFormat

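The original MyInputFormat source is not reproduced in this post, but its contract is visible from the drivers and from First_Mapper: each training document must arrive as a single <Text, Text> record whose key is the category (the NBCorpus/Country sub-folder name) and whose value is the whole document text. A minimal sketch of such a format, modeled on the WholeFileInputFormat shown at the end of the post, might look like the following; the record-reader class and field names are mine, and the original implementation may differ.

package org.wordCount;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Sketch of a whole-file input format whose key is the category (parent folder) name
// and whose value is the full document text. The original MyInputFormat may differ.
public class MyInputFormat extends FileInputFormat<Text, Text> {

    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new ClassNameRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false; // one record per file
    }

    static class ClassNameRecordReader extends RecordReader<Text, Text> {

        private FileSplit fileSplit;
        private Configuration conf;
        private final Text key = new Text();
        private final Text value = new Text();
        private boolean processed = false;

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) {
            this.fileSplit = (FileSplit) split;
            this.conf = context.getConfiguration();
        }

        @Override
        public boolean nextKeyValue() throws IOException {
            if (!processed) {
                Path file = fileSplit.getPath();
                // The parent directory name is the category label,
                // e.g. .../NBCorpus/Country/ALB/xxx.txt -> ALB
                key.set(file.getParent().getName());
                byte[] contents = new byte[(int) fileSplit.getLength()];
                FileSystem fs = file.getFileSystem(conf);
                FSDataInputStream in = null;
                try {
                    in = fs.open(file);
                    IOUtils.readFully(in, contents, 0, contents.length);
                    value.set(contents, 0, contents.length);
                } finally {
                    IOUtils.closeStream(in);
                }
                processed = true;
                return true;
            }
            return false;
        }

        @Override
        public Text getCurrentKey() { return key; }

        @Override
        public Text getCurrentValue() { return value; }

        @Override
        public float getProgress() { return processed ? 1.0f : 0.0f; }

        @Override
        public void close() { }
    }
}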

Two small utilities

package org.wordCount;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MyUtils {

    // Returns the second-level directories under a folder.
    static List<Path> getSecondDir(Configuration conf, String folder) throws Exception {
        Path path = new Path(folder);
        FileSystem fs = path.getFileSystem(conf);
        FileStatus[] stats = fs.listStatus(path);
        System.out.println("stats.length = " + stats.length);
        List<Path> folderPath = new ArrayList<Path>();
        for (FileStatus stat : stats) {
            if (stat.isDir()) {
                // Only categories with more than 10 files are used as input paths.
                if (fs.listStatus(stat.getPath()).length > 10) {
                    folderPath.add(stat.getPath());
                }
            }
        }
        return folderPath;
    }

    // Returns true if the string contains a digit.
    static boolean hasDigit(String content) {
        boolean flag = false;
        Pattern p = Pattern.compile(".*\\d+.*");
        Matcher m = p.matcher(content);
        if (m.matches())
            flag = true;
        return flag;
    }
}

Computing the probabilities

package org.wordCount;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class Probability {

    private static final Log LOG = LogFactory.getLog(FileInputFormat.class);
    public static int total = 0;
    private static MultipleOutputs<Text, DoubleWritable> mos;

    // Client
    public static void run(Configuration conf) throws Exception {
        // Read the vocabulary size and put it into the Configuration.
        String totalWordsPath = conf.get("totalWordsPath");
        // String wordsInClassPath = conf.get("wordsInClassPath");
        FileSystem fs = FileSystem.get(URI.create(totalWordsPath), conf);
        FSDataInputStream inputStream = fs.open(new Path(totalWordsPath));
        BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream));
        String strLine = buffer.readLine();
        String[] temp = strLine.split(":");
        if (temp.length == 2) {
            // temp[0] is "TOTALWORDS"
            conf.set(temp[0], temp[1]);
        }
        total = Integer.parseInt(conf.get("TOTALWORDS"));
        LOG.info("------>total = " + total);
        System.out.println("total ==== " + total);

        Job job = new Job(conf, "file count");
        job.setJarByClass(Probability.class);
        job.setMapperClass(WordsOfClassCountMapper.class);
        job.setReducerClass(WordsOfClassCountReducer.class);

        // The input of this job is the output of the word-frequency job.
        String input = conf.get("wordsOutput");
        String output = conf.get("freqOutput");
        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    // Mapper
    static class WordsOfClassCountMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

        private static DoubleWritable number = new DoubleWritable();
        private static Text className = new Text();
        // Total number of words in each class, loaded in setup() from wordsInClassPath.
        private static Map<String, Integer> filemap = new HashMap<String, Integer>();

        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            int tot = Integer.parseInt(conf.get("TOTALWORDS"));
            System.out.println("total = " + total);
            System.out.println("tot = " + tot);

            // Input lines look like:
            // ALB weekend 1
            // ALB weeks 3
            Map<String, Map<String, Integer>> baseMap = new HashMap<String, Map<String, Integer>>();
            String[] temp = value.toString().split("\t");

            // Store the line in baseMap first; temp[0] is the folder (class) name.
            if (temp.length == 3) {
                if (baseMap.containsKey(temp[0])) {
                    baseMap.get(temp[0]).put(temp[1], Integer.parseInt(temp[2]));
                } else {
                    Map<String, Integer> oneMap = new HashMap<String, Integer>();
                    oneMap.put(temp[1], Integer.parseInt(temp[2]));
                    baseMap.put(temp[0], oneMap);
                }
            }

            // Then compute, for every class in baseMap, the smoothed probability of each word.
            int allWordsInClass = 0;
            for (Map.Entry<String, Map<String, Integer>> entries : baseMap.entrySet()) {
                allWordsInClass = filemap.get(entries.getKey());
                for (Map.Entry<String, Integer> entry : entries.getValue().entrySet()) {
                    double p = (entry.getValue() + 1.0) / (allWordsInClass + tot);
                    className.set(entries.getKey() + "\t" + entry.getKey());
                    number.set(p);
                    LOG.info("------>p = " + p);
                    // The last argument makes MultipleOutputs write one file named after the class.
                    mos.write(new Text(entry.getKey()), number, entries.getKey());
                    // context.write(className, number);
                }
            }
        }

        protected void setup(Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            mos = new MultipleOutputs<Text, DoubleWritable>(context);
            // Load the per-class word totals produced by the word-frequency job.
            String filePath = conf.get("wordsInClassPath");
            FileSystem fs = FileSystem.get(URI.create(filePath), conf);
            FSDataInputStream inputStream = fs.open(new Path(filePath));
            BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream));
            String strLine = null;
            while ((strLine = buffer.readLine()) != null) {
                String[] temp = strLine.split("\t");
                filemap.put(temp[0], Integer.parseInt(temp[1]));
            }
        }

        // Finally compute, for each class, the probability of a word never seen in that class;
        // it is a constant per class.
        protected void cleanup(Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            int tot = Integer.parseInt(conf.get("TOTALWORDS"));
            for (Map.Entry<String, Integer> entry : filemap.entrySet()) {
                double notFind = (1.0) / (entry.getValue() + tot);
                number.set(notFind);
                mos.write(new Text(entry.getKey()), number, "_notFound" + "\\" + "notFound");
            }
            mos.close();
        }
    }

    // Reducer
    static class WordsOfClassCountReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {

        // result holds the summed value for each key.
        DoubleWritable result = new DoubleWritable();

        protected void reduce(Text key, Iterable<DoubleWritable> values,
                Reducer<Text, DoubleWritable, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            double sum = 0;
            for (DoubleWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
}

Prediction


The InputFormat for prediction

package org.wordCount.predict;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class WholeFileInputFormat extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        WholeFileRecordReader reader = new WholeFileRecordReader();
        reader.initialize(split, context);
        return reader;
    }

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // Each file becomes exactly one record, so it must not be split.
        return false;
    }
}

class WholeFileRecordReader extends RecordReader<LongWritable, Text> {

    private FileSplit fileSplit;                   // the input split; it becomes a single <key, value> record
    private Configuration conf;                    // configuration object
    private Text value = new Text();               // the whole file content
    private LongWritable key = new LongWritable(); // the key; left at 0
    private boolean processed = false;             // whether this split has already been processed

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        this.fileSplit = (FileSplit) split; // cast the split to a FileSplit
        this.conf = context.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!processed) {
            byte[] contents = new byte[(int) fileSplit.getLength()];
            Path file = fileSplit.getPath();
            FileSystem fs = file.getFileSystem(conf);
            FSDataInputStream in = null;
            try {
                in = fs.open(file);
                IOUtils.readFully(in, contents, 0, contents.length);
                value.set(contents, 0, contents.length);
            } finally {
                IOUtils.closeStream(in);
            }
            processed = true;
            return true;
        }
        return false;
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {
        // nothing to close
    }
}
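Wiring this format into a prediction job works like any other InputFormat. A minimal, illustrative snippet (the Predict driver class is a placeholder; the test path is the one used in the commented-out block of WordMain):

Job job = new Job(conf, "predict");
job.setJarByClass(Predict.class);                    // placeholder driver class
job.setInputFormatClass(WholeFileInputFormat.class); // one record per test document
WholeFileInputFormat.addInputPath(job, new Path("hdfs://192.168.190.128:9000/user/hadoop/test/"));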
