Training

Entry point

package org.wordCount;

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WordMain {

    // private static List<String> secondDir = new ArrayList<String>();

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // The next two lines are important: ship the local jar and point at the JobTracker.
        conf.set("mapred.jar", "E://eclipse//jar-work//WordMain.jar");
        conf.set("mapred.job.tracker", "192.168.190.128:9001");

        // Where the prior probability of each class is saved
        String priorProbality = "hdfs://192.168.190.128:9000/user/hadoop/output/priorP/priorProbability.txt";
        conf.set("priorProbality", priorProbality);

        // Where the total number of distinct words (vocabulary size) is saved
        String totalWordsPath = "hdfs://192.168.190.128:9000/user/hadoop/output/totalwords.txt";
        conf.set("totalWordsPath", totalWordsPath);

        // Total number of words in each class
        String wordsInClassPath = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsFrequence/_wordsInClass/wordsInClass-r-00000";
        conf.set("wordsInClassPath", wordsInClassPath);

        // Corpus input path and output path of the word-frequency job
        // "/user/hadoop/input/NBCorpus/Country"
        String input = "hdfs://192.168.190.128:9000/user/hadoop/input/NBCorpus/Country";
        String wordsOutput = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsFrequence";
        conf.set("input", input);
        conf.set("wordsOutput", wordsOutput);

        // Where the per-class word probabilities are saved;
        // the input of that step is the word-frequency output above.
        String freqOutput = "hdfs://192.168.190.128:9000/user/hadoop/output/probability/";
        conf.set("freqOutput", freqOutput);

        FileCount.run(conf);
        WordCount.run(conf);
        Probability.run(conf);

        /*
        System.out.print("----------");
        String[] otherArgs = new String[] { "hdfs://192.168.190.128:9000/user/hadoop/test/",
                "hdfs://192.168.190.128:9000/user/hadoop/wordcount/output2/" };
        conf.set("mapred.jar", "E://eclipse//jar-work//WordMain.jar");
        Job job = new Job(conf, "word count");
        job.setJarByClass(WordMain.class);
        job.setInputFormatClass(MyInputFormat.class);
        job.setMapperClass(WordMapper.class);
        // job.setCombinerClass(WordReducer.class);
        job.setReducerClass(WordReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // MyUtils.addInputPath(job, new Path(otherArgs[0]), conf);
        List<Path> inputPaths = getSecondDir(conf, otherArgs[0]);
        for (Path path : inputPaths) {
            System.out.println("path = " + path.toString());
            MyInputFormat.addInputPath(job, path);
        }
        System.out.println("addinputpath ok");
        // FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
        */
    }

    // Returns the second-level directories under a folder.
    static List<Path> getSecondDir(Configuration conf, String folder) throws Exception {
        Path path = new Path(folder);
        FileSystem fs = path.getFileSystem(conf);
        FileStatus[] stats = fs.listStatus(path);
        List<Path> folderPath = new ArrayList<Path>();
        for (FileStatus stat : stats) {
            if (stat.isDir()) {
                // Only keep categories that contain more than 10 files.
                if (fs.listStatus(stat.getPath()).length > 10) {
                    folderPath.add(stat.getPath());
                }
            }
        }
        return folderPath;
    }
}

Counting the number of documents per category

package org.wordCount;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

/**
 * Counts the documents in each category and computes the prior probabilities.
 * The priors are saved to /user/hadoop/output/priorP/prior.txt.
 */
public class FileCount {

    public static void run(Configuration conf) throws Exception {
        int sum = 0;
        String in = conf.get("input");
        Map<String, Integer> map = new HashMap<>();
        Map<String, Double> priorMap = new HashMap<>();

        // Count the documents per category (categories with too few documents are filtered out below).
        map = FileCount.getFileNumber(in);

        // Print each category and its document count, accumulating the total.
        Iterator<Map.Entry<String, Integer>> itrs = map.entrySet().iterator();
        while (itrs.hasNext()) {
            Map.Entry<String, Integer> it = itrs.next();
            if (it.getValue() <= 10) {
                // Skip categories with 10 documents or fewer.
                itrs.remove();
            } else {
                sum += it.getValue();
                System.out.println(it.getKey() + "\t" + it.getValue());
            }
        }
        System.out.println("sum = " + sum);

        String output = conf.get("priorProbality");
        Path outputPath = new Path(output);
        FileSystem fs = outputPath.getFileSystem(conf);
        FSDataOutputStream outputStream = fs.create(outputPath);

        // The prior of a class is the fraction of all documents that belong to it.
        String ctx = "";
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            Double result = 0.0;
            result = Double.parseDouble(entry.getValue().toString()) / sum;
            priorMap.put(entry.getKey(), result); // also keep it in priorMap
            ctx += entry.getKey() + "\t" + result + "\n";
        }
        outputStream.writeBytes(ctx);
        IOUtils.closeStream(outputStream);

        // Print the priors (another way of iterating over a map); they have already been written to the file.
        Iterator<Map.Entry<String, Double>> iterators = priorMap.entrySet().iterator();
        while (iterators.hasNext()) {
            Map.Entry<String, Double> iterator = iterators.next();
            System.out.println(iterator.getKey() + "\t" + iterator.getValue());
        }
    }

    // Returns a map from category name to the number of files in that category's folder.
    public static Map<String, Integer> getFileNumber(String folderPath) throws Exception {
        Map<String, Integer> fileMap = new HashMap<>();
        Configuration conf = new Configuration();
        Path path = new Path(folderPath);
        FileSystem hdfs = path.getFileSystem(conf);
        FileStatus[] status = hdfs.listStatus(path);
        for (FileStatus stat : status) {
            if (stat.isDir()) {
                int length = hdfs.listStatus(stat.getPath()).length;
                String name = stat.getPath().getName();
                fileMap.put(name, length);
            }
        }
        return fileMap;
    }
}
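To make the prior concrete: FileCount keeps only categories with more than 10 documents, sums their document counts into sum, and writes one line per class to priorProbability.txt. So if, purely for illustration, class ALB had 120 documents out of a filtered total of 1000, the line written would be "ALB	0.12", i.e. P(c) = N_c / N.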

Counting the words in each document

package org.wordCount;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class WordCount {

    private static MultipleOutputs<Text, IntWritable> mos;

    // TOTALWORDS counts the number of distinct words (the vocabulary size).
    static enum WordsNature {
        CLSASS_NUMBER, CLASS_WORDS, TOTALWORDS
    }

    // Mapper: the key is the category name (from MyInputFormat) and the value is the document text.
    static class First_Mapper extends Mapper<Text, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private final static IntWritable zero = new IntWritable(0);
        private Text countryName = new Text();

        @Override
        protected void map(Text key, Text value, Mapper<Text, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                String word = itr.nextToken();
                if (!(MyUtils.hasDigit(word) || word.contains("."))) { // skip meaningless tokens
                    countryName.set(key.toString() + "\t" + word);
                    context.write(countryName, one);     // word frequency within a class, e.g. "ABL have 1"
                    context.write(key, one);             // total number of words in the class
                    context.write(new Text(word), zero); // used to count the number of distinct words
                }
            }
        }
    }

    // Reducer
    static class First_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // result holds the count of one word within one class.
        IntWritable result = new IntWritable();
        Map<String, List<String>> classMap = new HashMap<String, List<String>>();
        Map<String, List<String>> fileMap = new HashMap<String, List<String>>();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            // A sum of 0 means the key is a bare word (only zeros were emitted for it),
            // so it adds 1 to the vocabulary size.
            if (sum == 0) {
                context.getCounter(WordsNature.TOTALWORDS).increment(1);
            } else {
                // Otherwise the shape of the key tells us what the record is.
                String[] temp = key.toString().split("\t");
                if (temp.length == 2) {
                    // "class \t word": the word frequency within a class.
                    result.set(sum);
                    context.write(key, result);
                    // mos.write(new Text(temp[1]), result, temp[0]);
                } else {
                    // A bare class name: the total number of words in that class.
                    result.set(sum);
                    mos.write(key, result, "_wordsInClass" + "\\" + "wordsInClass");
                }
            }
        }

        @Override
        protected void setup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            mos = new MultipleOutputs<Text, IntWritable>(context);
        }

        @Override
        protected void cleanup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            mos.close();
        }
    }

    public static int run(Configuration conf) throws Exception {
        Job job = new Job(conf, "file count");
        job.setJarByClass(WordCount.class);
        job.setInputFormatClass(MyInputFormat.class);
        job.setMapperClass(WordCount.First_Mapper.class);
        job.setReducerClass(WordCount.First_Reducer.class);

        // Only use categories that contain more than 10 documents as input paths.
        String input = conf.get("input");
        List<Path> inputPaths = MyUtils.getSecondDir(conf, input);
        for (Path path : inputPaths) {
            System.out.println("path = " + path.toString());
            MyInputFormat.addInputPath(job, path);
        }

        String wordsOutput = conf.get("wordsOutput");
        FileOutputFormat.setOutputPath(job, new Path(wordsOutput));
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        int exitCode = job.waitForCompletion(true) ? 0 : 1;

        // Read the counter that holds the number of distinct words.
        Counters counters = job.getCounters();
        Counter c1 = counters.findCounter(WordsNature.TOTALWORDS);
        System.out.println("-------------->>>>: " + c1.getDisplayName() + ":" + c1.getName() + ": " + c1.getValue());

        // Write the vocabulary size to a file.
        Path totalWordsPath = new Path("hdfs://192.168.190.128:9000/user/hadoop/output/totalwords.txt");
        FileSystem fs = totalWordsPath.getFileSystem(conf);
        FSDataOutputStream outputStream = fs.create(totalWordsPath);
        outputStream.writeBytes(c1.getDisplayName() + ":" + c1.getValue());
        IOUtils.closeStream(outputStream);

        // A possible improvement: write the vocabulary size into the Configuration
        // instead of a file before the probability step.
        // conf.set("TOTALWORDS", totalWords.toString());

        return exitCode;
    }
}
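The trick in First_Mapper is that it emits three different kinds of keys for the same token, and First_Reducer tells them apart by the summed value and by the tab in the key. For a document of class ALB containing the word weekend, one map() call emits (values shown for illustration):

("ALB\tweekend", 1)   word frequency within a class; the reducer sums these and writes them to the main output
("ALB", 1)            words in the class; the reducer routes the sum to _wordsInClass/wordsInClass via MultipleOutputs
("weekend", 0)        a bare word with value 0; a zero sum makes the reducer increment the TOTALWORDS counter, i.e. the vocabulary size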

MyInputFormat

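The original MyInputFormat source is not reproduced in this post, but its contract is visible from the drivers and from First_Mapper: each training document must arrive as a single <Text, Text> record whose key is the category (the NBCorpus/Country sub-folder name) and whose value is the whole document text. A minimal sketch of such a format, modeled on the WholeFileInputFormat shown at the end of the post, might look like the following; the record-reader class and field names are mine, and the original implementation may differ.

package org.wordCount;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Sketch of a whole-file input format whose key is the category (parent folder) name
// and whose value is the full document text. The original MyInputFormat may differ.
public class MyInputFormat extends FileInputFormat<Text, Text> {

    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new ClassNameRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false; // one record per file
    }

    static class ClassNameRecordReader extends RecordReader<Text, Text> {

        private FileSplit fileSplit;
        private Configuration conf;
        private final Text key = new Text();
        private final Text value = new Text();
        private boolean processed = false;

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) {
            this.fileSplit = (FileSplit) split;
            this.conf = context.getConfiguration();
        }

        @Override
        public boolean nextKeyValue() throws IOException {
            if (!processed) {
                Path file = fileSplit.getPath();
                // The parent directory name is the category label,
                // e.g. .../NBCorpus/Country/ALB/xxx.txt -> ALB
                key.set(file.getParent().getName());
                byte[] contents = new byte[(int) fileSplit.getLength()];
                FileSystem fs = file.getFileSystem(conf);
                FSDataInputStream in = null;
                try {
                    in = fs.open(file);
                    IOUtils.readFully(in, contents, 0, contents.length);
                    value.set(contents, 0, contents.length);
                } finally {
                    IOUtils.closeStream(in);
                }
                processed = true;
                return true;
            }
            return false;
        }

        @Override
        public Text getCurrentKey() { return key; }

        @Override
        public Text getCurrentValue() { return value; }

        @Override
        public float getProgress() { return processed ? 1.0f : 0.0f; }

        @Override
        public void close() { }
    }
}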

Two small utilities

package org.wordCount;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MyUtils {

    // Returns the second-level directories under a folder.
    static List<Path> getSecondDir(Configuration conf, String folder) throws Exception {
        Path path = new Path(folder);
        FileSystem fs = path.getFileSystem(conf);
        FileStatus[] stats = fs.listStatus(path);
        System.out.println("stats.length = " + stats.length);
        List<Path> folderPath = new ArrayList<Path>();
        for (FileStatus stat : stats) {
            if (stat.isDir()) {
                // Only categories with more than 10 files are used as input paths.
                if (fs.listStatus(stat.getPath()).length > 10) {
                    folderPath.add(stat.getPath());
                }
            }
        }
        return folderPath;
    }

    // Returns true if the string contains a digit.
    static boolean hasDigit(String content) {
        boolean flag = false;
        Pattern p = Pattern.compile(".*\\d+.*");
        Matcher m = p.matcher(content);
        if (m.matches())
            flag = true;
        return flag;
    }
}

Computing the probabilities

package org.wordCount;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class Probability {

    private static final Log LOG = LogFactory.getLog(FileInputFormat.class);
    public static int total = 0;
    private static MultipleOutputs<Text, DoubleWritable> mos;

    // Client
    public static void run(Configuration conf) throws Exception {
        // Read the vocabulary size and put it into the Configuration.
        String totalWordsPath = conf.get("totalWordsPath");
        // String wordsInClassPath = conf.get("wordsInClassPath");
        FileSystem fs = FileSystem.get(URI.create(totalWordsPath), conf);
        FSDataInputStream inputStream = fs.open(new Path(totalWordsPath));
        BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream));
        String strLine = buffer.readLine();
        String[] temp = strLine.split(":");
        if (temp.length == 2) {
            // temp[0] is "TOTALWORDS"
            conf.set(temp[0], temp[1]);
        }
        total = Integer.parseInt(conf.get("TOTALWORDS"));
        LOG.info("------>total = " + total);
        System.out.println("total ==== " + total);

        Job job = new Job(conf, "file count");
        job.setJarByClass(Probability.class);
        job.setMapperClass(WordsOfClassCountMapper.class);
        job.setReducerClass(WordsOfClassCountReducer.class);

        // The input of this job is the output of the word-frequency job.
        String input = conf.get("wordsOutput");
        String output = conf.get("freqOutput");
        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    // Mapper
    static class WordsOfClassCountMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

        private static DoubleWritable number = new DoubleWritable();
        private static Text className = new Text();
        // Total number of words in each class, loaded in setup() from wordsInClassPath.
        private static Map<String, Integer> filemap = new HashMap<String, Integer>();

        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            int tot = Integer.parseInt(conf.get("TOTALWORDS"));
            System.out.println("total = " + total);
            System.out.println("tot = " + tot);

            // Input lines look like:
            // ALB weekend 1
            // ALB weeks 3
            Map<String, Map<String, Integer>> baseMap = new HashMap<String, Map<String, Integer>>();
            String[] temp = value.toString().split("\t");

            // Store the line in baseMap first; temp[0] is the folder (class) name.
            if (temp.length == 3) {
                if (baseMap.containsKey(temp[0])) {
                    baseMap.get(temp[0]).put(temp[1], Integer.parseInt(temp[2]));
                } else {
                    Map<String, Integer> oneMap = new HashMap<String, Integer>();
                    oneMap.put(temp[1], Integer.parseInt(temp[2]));
                    baseMap.put(temp[0], oneMap);
                }
            }

            // Then compute, for every class in baseMap, the smoothed probability of each word.
            int allWordsInClass = 0;
            for (Map.Entry<String, Map<String, Integer>> entries : baseMap.entrySet()) {
                allWordsInClass = filemap.get(entries.getKey());
                for (Map.Entry<String, Integer> entry : entries.getValue().entrySet()) {
                    double p = (entry.getValue() + 1.0) / (allWordsInClass + tot);
                    className.set(entries.getKey() + "\t" + entry.getKey());
                    number.set(p);
                    LOG.info("------>p = " + p);
                    // The last argument makes MultipleOutputs write one file named after the class.
                    mos.write(new Text(entry.getKey()), number, entries.getKey());
                    // context.write(className, number);
                }
            }
        }

        protected void setup(Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            mos = new MultipleOutputs<Text, DoubleWritable>(context);
            // Load the per-class word totals produced by the word-frequency job.
            String filePath = conf.get("wordsInClassPath");
            FileSystem fs = FileSystem.get(URI.create(filePath), conf);
            FSDataInputStream inputStream = fs.open(new Path(filePath));
            BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream));
            String strLine = null;
            while ((strLine = buffer.readLine()) != null) {
                String[] temp = strLine.split("\t");
                filemap.put(temp[0], Integer.parseInt(temp[1]));
            }
        }

        // Finally compute, for each class, the probability of a word never seen in that class;
        // it is a constant per class.
        protected void cleanup(Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            int tot = Integer.parseInt(conf.get("TOTALWORDS"));
            for (Map.Entry<String, Integer> entry : filemap.entrySet()) {
                double notFind = (1.0) / (entry.getValue() + tot);
                number.set(notFind);
                mos.write(new Text(entry.getKey()), number, "_notFound" + "\\" + "notFound");
            }
            mos.close();
        }
    }

    // Reducer
    static class WordsOfClassCountReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {

        // result holds the summed value for each key.
        DoubleWritable result = new DoubleWritable();

        protected void reduce(Text key, Iterable<DoubleWritable> values,
                Reducer<Text, DoubleWritable, Text, DoubleWritable>.Context context)
                throws IOException, InterruptedException {
            double sum = 0;
            for (DoubleWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
}

Prediction


The InputFormat for prediction

package org.wordCount.predict;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class WholeFileInputFormat extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        WholeFileRecordReader reader = new WholeFileRecordReader();
        reader.initialize(split, context);
        return reader;
    }

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // Each file becomes exactly one record, so it must not be split.
        return false;
    }
}

class WholeFileRecordReader extends RecordReader<LongWritable, Text> {

    private FileSplit fileSplit;                   // the input split; it becomes a single <key, value> record
    private Configuration conf;                    // configuration object
    private Text value = new Text();               // the whole file content
    private LongWritable key = new LongWritable(); // the key; left at 0
    private boolean processed = false;             // whether this split has already been processed

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        this.fileSplit = (FileSplit) split; // cast the split to a FileSplit
        this.conf = context.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!processed) {
            byte[] contents = new byte[(int) fileSplit.getLength()];
            Path file = fileSplit.getPath();
            FileSystem fs = file.getFileSystem(conf);
            FSDataInputStream in = null;
            try {
                in = fs.open(file);
                IOUtils.readFully(in, contents, 0, contents.length);
                value.set(contents, 0, contents.length);
            } finally {
                IOUtils.closeStream(in);
            }
            processed = true;
            return true;
        }
        return false;
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {
        // nothing to close
    }
}
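Wiring this format into a prediction job works like any other InputFormat. A minimal, illustrative snippet (the Predict driver class is a placeholder; the test path is the one used in the commented-out block of WordMain):

Job job = new Job(conf, "predict");
job.setJarByClass(Predict.class);                    // placeholder driver class
job.setInputFormatClass(WholeFileInputFormat.class); // one record per test document
WholeFileInputFormat.addInputPath(job, new Path("hdfs://192.168.190.128:9000/user/hadoop/test/"));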
