二分类:SVMs,logistic regression,decision trees,random forests,gradient-boosted trees,naive Bayes

多分类:             logistic regression,decision trees,random forests,                                        naive Bayes

回归:      linear least squares,    decision trees,random forests,gradient-boosted trees,                       isotonic regression。

一。Linear models

  

   

    

  classification (SVMs, logistic regression)

   

 package ML.ClassificationAndRegression;

 import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.SVMModel;
import org.apache.spark.mllib.classification.SVMWithSGD;
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics;
import org.apache.spark.mllib.optimization.L1Updater;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.rdd.RDD;
import scala.Tuple2; /**
* TODO
*
* @ClassName: SVMClassifier
* @author: DingH
* @since: 2019/4/9 10:28
*/
public class SVMClassifier {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("SVM Classifier Example").setMaster("local");
JavaSparkContext jsc = new JavaSparkContext(conf);
String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD(); // Split initial RDD into two... [60% training data, 40% testing data].
JavaRDD<LabeledPoint> train = data.sample(false, 0.6, 11L);
train.cache();
final JavaRDD<LabeledPoint> test = data.subtract(train); //Run training algorithm to build the model
int numsIterations = 100;
SVMWithSGD svm = new SVMWithSGD();
svm.optimizer().setNumIterations(200).setRegParam(0.01).setUpdater(new L1Updater());
final SVMModel model1 = svm.run(train.rdd());
// final SVMModel model1 = SVMWithSGD.train(train.rdd(), numsIterations); model1.clearThreshold(); JavaRDD<Tuple2<Object, Object>> scoraAndLables = test.map(new Function<LabeledPoint, Tuple2<Object, Object>>() {
public Tuple2<Object, Object> call(LabeledPoint p) throws Exception {
double predict = model1.predict(p.features());
return new Tuple2<Object, Object>(predict, p.label());
}
}); BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(scoraAndLables.rdd()); double areaUnderROC = metrics.areaUnderROC(); System.out.println("Area under ROC = " + areaUnderROC); model1.save(jsc.sc(),"D:\\IdeaProjects\\SimpleApp\\src\\main\\java\\MLModel");
SVMModel model = SVMModel.load(jsc.sc(), "D:\\IdeaProjects\\SimpleApp\\src\\main\\java\\MLModel"); }
}

SVMClassifier

 package ML.ClassificationAndRegression;

 import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.ml.classification.MultiClassSummarizer;
import org.apache.spark.mllib.classification.LogisticRegressionModel;
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS;
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD;
import org.apache.spark.mllib.evaluation.MulticlassMetrics;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2; /**
* TODO
*
* @ClassName: LogistiRegression
* @author: DingH
* @since: 2019/4/9 11:08
*/
public class LogistiRegression {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local").setAppName("LogisticRegression");
JavaSparkContext jsc = new JavaSparkContext(conf);
String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD(); JavaRDD<LabeledPoint>[] split = data.randomSplit(new double[]{0.6, 0.4}, 11L);
JavaRDD<LabeledPoint> training = split[0].cache();
final JavaRDD<LabeledPoint> test = split[1]; final LogisticRegressionModel model = new LogisticRegressionWithLBFGS().setNumClasses(10).run(training.rdd());
JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(new Function<LabeledPoint, Tuple2<Object, Object>>() {
public Tuple2<Object, Object> call(LabeledPoint labeledPoint) throws Exception {
double predict = model.predict(labeledPoint.features());
return new Tuple2<Object, Object>(predict, labeledPoint.label());
}
}); MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); double precision = metrics.precision();
System.out.println("Precision = " + precision); // Save and load model
// model.save(jsc.sc(), "myModelPath");
// LogisticRegressionModel sameModel = LogisticRegressionModel.load(jsc.sc(), "myModelPath");
}
}

LogistiRegression

  linear regression (least squares, Lasso, ridge)

 

 package ML.ClassificationAndRegression;

 import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.regression.LinearRegressionModel;
import org.apache.spark.mllib.regression.LinearRegressionWithSGD;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2; /**
* TODO
*
* @ClassName: Regression
* @author: DingH
* @since: 2019/4/9 11:21
*/
public class Regression {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("Regression").setMaster("local");
JavaSparkContext jsc = new JavaSparkContext(conf);
String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\ridge-data\\lpsa.data";
JavaRDD<String> data = jsc.textFile(path); JavaRDD<LabeledPoint> parsedData = data.map(new Function<String, LabeledPoint>() {
public LabeledPoint call(String line) throws Exception {
String[] split = line.split(",");
String[] features = split[1].split(" ");
double[] v = new double[features.length];
for (int i = 0; i < features.length - 1; i++) {
v[i] = Double.parseDouble(features[i]);
} return new LabeledPoint(Double.parseDouble(split[0]), Vectors.dense(v));
}
}).cache(); final LinearRegressionModel model = LinearRegressionWithSGD.train(parsedData.rdd(), 100); JavaRDD<Tuple2<Double, Double>> valuesAndLabels = parsedData.map(new Function<LabeledPoint, Tuple2<Double, Double>>() {
public Tuple2<Double, Double> call(LabeledPoint labeledPoint) throws Exception {
double predict = model.predict(labeledPoint.features());
return new Tuple2<Double, Double>(predict, labeledPoint.label());
}
}); Double MSE = new JavaDoubleRDD(valuesAndLabels.map(
new Function<Tuple2<Double, Double>, Object>() {
public Object call(Tuple2<Double, Double> dat) throws Exception {
return Math.pow(dat._1 - dat._2, 2.0);
}
}
).rdd()).mean();
System.out.println("training Mean Squared Error = " + MSE); // Save and load model
// model.save(jsc.sc(), "myModelPath");
// LinearRegressionModel sameModel = LinearRegressionModel.load(jsc.sc(), "myModelPath");
}
}

Regression

二。Decision Trees.

 

problem specification parameters:   algo, numClasses, categoricalFeaturesInfo

stopping criteria : maxDepth, minInfoGain, minInstancesPerNode

tunable parameters: maxBins, impurity

package ML.DT;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.DecisionTree;
import org.apache.spark.mllib.tree.model.DecisionTreeModel;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2; import java.util.HashMap; /**
* TODO
*
* @ClassName: classification
* @author: DingH
* @since: 2019/4/9 16:11
*/
public class classification {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("DTclassification").setMaster("local");
JavaSparkContext jsc = new JavaSparkContext(conf); String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD(); JavaRDD<LabeledPoint>[] split = data.randomSplit(new double[]{0.7, 0.3}, 11L);
JavaRDD<LabeledPoint> trainningData = split[0];
JavaRDD<LabeledPoint> test = split[1]; int numsClasses = 2;
HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
String impurity = "gini";
int maxDepth = 1;
int maxbins = 32; final DecisionTreeModel model = DecisionTree.trainClassifier(trainningData, numsClasses,categoricalFeaturesInfo, impurity, maxDepth,maxbins);
JavaPairRDD<Double, Double> predictionAndLable = test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
public Tuple2<Double, Double> call(LabeledPoint labeledPoint) throws Exception {
return new Tuple2<Double, Double>(model.predict(labeledPoint.features()), labeledPoint.label());
}
}); double testErr = predictionAndLable.filter(new Function<Tuple2<Double, Double>, Boolean>() {
public Boolean call(Tuple2<Double, Double> doubleDoubleTuple2) throws Exception {
return !doubleDoubleTuple2._1().equals(doubleDoubleTuple2._2());
}
}).count() * 1.0 / test.count(); System.out.println("Test Error: " + testErr);
System.out.println("Learned classification tree model:\n" + model.toDebugString()); // Save and load model
// model.save(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel");
// DecisionTreeModel sameModel = DecisionTreeModel.load(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel"); }
}

classification

package ML.DT;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.DecisionTree;
import org.apache.spark.mllib.tree.model.DecisionTreeModel;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2; import java.util.HashMap;
import java.util.Map; /**
* TODO
*
* @ClassName: Regression
* @author: DingH
* @since: 2019/4/9 16:33
*/
public class Regression {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTreeRegressionExample").setMaster("local");
JavaSparkContext jsc = new JavaSparkContext(sparkConf); // Load and parse the data file.
String datapath = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
// Split the data into training and test sets (30% held out for testing)
JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
JavaRDD<LabeledPoint> trainingData = splits[0];
JavaRDD<LabeledPoint> testData = splits[1]; // Set parameters.
// Empty categoricalFeaturesInfo indicates all features are continuous.
Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
String impurity = "variance";
Integer maxDepth = 5;
Integer maxBins = 32; // Train a DecisionTree model.
final DecisionTreeModel model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo, impurity, maxDepth, maxBins); // Evaluate model on test instances and compute test error
JavaPairRDD<Double, Double> predictionAndLabel = testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
public Tuple2<Double, Double> call(LabeledPoint p) {
return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
}
}); Double testMSE = predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
public Double call(Tuple2<Double, Double> pl) {
Double diff = pl._1() - pl._2();
return diff * diff;
}
}).reduce(new Function2<Double, Double, Double>() {
public Double call(Double a, Double b) {
return a + b;
}
}) / data.count(); System.out.println("Test Mean Squared Error: " + testMSE);
System.out.println("Learned regression tree model:\n" + model.toDebugString()); // Save and load model
// model.save(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel");
// DecisionTreeModel sameModel = DecisionTreeModel.load(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel");
}
}

Regression

三。Random Forests

  样本随机,特征随机

  featureSubsetStrategy - Number of features to consider for splits at each node. Supported: "auto", "all", "sqrt", "log2", "onethird". If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "sqrt".

package ML.RF;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.RandomForest;
import org.apache.spark.mllib.tree.model.RandomForestModel;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2; import java.util.HashMap; /**
* TODO
*
* @ClassName: classification
* @author: DingH
* @since: 2019/4/9 16:58
*/
public class classification {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("JavaRandomForestClassificationExample");
JavaSparkContext jsc = new JavaSparkContext(sparkConf); // Load and parse the data file.
String datapath = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD(); // Split the data into training and test sets (30% held out for testing)
JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
JavaRDD<LabeledPoint> trainingData = splits[0];
JavaRDD<LabeledPoint> testData = splits[1]; // Train a RandomForest model.
// Empty categoricalFeaturesInfo indicates all features are continuous.
Integer numClasses = 2;
HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
Integer numTrees = 3; // Use more in practice.
String featureSubsetStrategy = "auto"; // Let the algorithm choose.
String impurity = "gini";
Integer maxDepth = 5;
Integer maxBins = 32;
Integer seed = 12345; final RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses,categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed); // Evaluate model on test instances and compute test error
JavaPairRDD<Double, Double> predictionAndLabel = testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
public Tuple2<Double, Double> call(LabeledPoint p) {
return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
}
}); Double testErr =
1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
public Boolean call(Tuple2<Double, Double> pl) {
return !pl._1().equals(pl._2());
}
}).count() / testData.count(); System.out.println("Test Error: " + testErr);
System.out.println("Learned classification forest model:\n" + model.toDebugString()); // Save and load model
// model.save(jsc.sc(), "target/tmp/myRandomForestClassificationModel");
// RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),"target/tmp/myRandomForestClassificationModel");
} }

classification

package ML.RF;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.RandomForest;
import org.apache.spark.mllib.tree.model.RandomForestModel;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2; import java.util.HashMap;
import java.util.Map; /**
* TODO
*
* @ClassName: regression
* @author: DingH
* @since: 2019/4/9 17:50
*/
public class regression {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("JavaRandomForestRegressionExample");
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
// Load and parse the data file.
String datapath = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD(); // Split the data into training and test sets (30% held out for testing)
JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
JavaRDD<LabeledPoint> trainingData = splits[0];
JavaRDD<LabeledPoint> testData = splits[1]; // Set parameters.
// Empty categoricalFeaturesInfo indicates all features are continuous.
Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
Integer numTrees = 3; // Use more in practice.
String featureSubsetStrategy = "auto"; // Let the algorithm choose.
String impurity = "variance";
Integer maxDepth = 4;
Integer maxBins = 32;
Integer seed = 12345;
// Train a RandomForest model.
final RandomForestModel model = RandomForest.trainRegressor(trainingData,categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed); // Evaluate model on test instances and compute test error
JavaPairRDD<Double, Double> predictionAndLabel = testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
public Tuple2<Double, Double> call(LabeledPoint p) {
return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
}
});
Double testMSE = predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
public Double call(Tuple2<Double, Double> pl) {
Double diff = pl._1() - pl._2();
return diff * diff;
}
}).reduce(new Function2<Double, Double, Double>() {
public Double call(Double a, Double b) {
return a + b;
}
}) / testData.count(); System.out.println("Test Mean Squared Error: " + testMSE);
System.out.println("Learned regression forest model:\n" + model.toDebugString()); // Save and load model
model.save(jsc.sc(), "target/tmp/myRandomForestRegressionModel");
RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),
"target/tmp/myRandomForestRegressionModel");
}
}

regression

四。Gradient-Boosted Trees

  

Usage tips: loss, numIterations, learningRate,  algo

BoostingStrategy.validationTol

package ML.GradientBoostedTrees;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.GradientBoostedTrees;
import org.apache.spark.mllib.tree.configuration.BoostingStrategy;
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2; import java.util.HashMap;
import java.util.Map; /**
* TODO
*
* @ClassName: classification
* @author: DingH
* @since: 2019/4/9 17:56
*/
public class classification {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("JavaGradientBoostedTreesClassificationExample");
JavaSparkContext jsc = new JavaSparkContext(sparkConf); // Load and parse the data file.
String datapath = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD(); // Split the data into training and test sets (30% held out for testing)
JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
JavaRDD<LabeledPoint> trainingData = splits[0];
JavaRDD<LabeledPoint> testData = splits[1]; // Train a GradientBoostedTrees model.
// The defaultParams for Classification use LogLoss by default.
BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Classification");
boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice.
boostingStrategy.getTreeStrategy().setNumClasses(2);
boostingStrategy.getTreeStrategy().setMaxDepth(5);
// Empty categoricalFeaturesInfo indicates all features are continuous.
Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo); final GradientBoostedTreesModel model =
GradientBoostedTrees.train(trainingData, boostingStrategy); // Evaluate model on test instances and compute test error
JavaPairRDD<Double, Double> predictionAndLabel =
testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
public Tuple2<Double, Double> call(LabeledPoint p) {
return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
}
});
Double testErr =
1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
public Boolean call(Tuple2<Double, Double> pl) {
return !pl._1().equals(pl._2());
}
}).count() / testData.count();
System.out.println("Test Error: " + testErr);
System.out.println("Learned classification GBT model:\n" + model.toDebugString()); // Save and load model
// model.save(jsc.sc(), "target/tmp/myGradientBoostingClassificationModel");
// GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(jsc.sc(),
// "target/tmp/myGradientBoostingClassificationModel");
}
}

classification

regression

五。naive Bayes

  model type : "multinomial","bernoulli"

  

 package ML.ClassificationAndRegression.NaiveBayes;

 import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.NaiveBayes;
import org.apache.spark.mllib.classification.NaiveBayesModel;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2; /**
* TODO
*
* @ClassName: example
* @author: DingH
* @since: 2019/4/10 10:04
*/
public class example {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local").setAppName("naiveBayesExample");
JavaSparkContext jsc = new JavaSparkContext(conf); String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD(); JavaRDD<LabeledPoint>[] split = data.randomSplit(new double[]{0.6, 0.4}, 12345L);
JavaRDD<LabeledPoint> train = split[0];
JavaRDD<LabeledPoint> test = split[1]; final NaiveBayesModel model = NaiveBayes.train(train.rdd(), 1.0, "multinomial"); JavaRDD<Tuple2<Double, Double>> predictionAndLabel = test.map(new Function<LabeledPoint, Tuple2<Double, Double>>() {
public Tuple2<Double, Double> call(LabeledPoint labeledPoint) throws Exception {
double predict = model.predict(labeledPoint.features());
return new Tuple2<Double, Double>(predict, labeledPoint.label());
}
}); double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
public Boolean call(Tuple2<Double, Double> doubleDoubleTuple2) throws Exception {
return doubleDoubleTuple2._1().equals(doubleDoubleTuple2._2());
}
}).count() / (double) test.count(); System.out.println("acucuracy is : " + accuracy); NaiveBayesModel model1 = NaiveBayesModel.load(jsc.sc(), ""); }
}

naive Bayes

六。isotonic regression

  

  

 package ML.ClassificationAndRegression.IsotonicRegression;

 import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.regression.IsotonicRegression;
import org.apache.spark.mllib.regression.IsotonicRegressionModel;
import scala.Tuple2;
import scala.Tuple3; /**
* TODO
*
* @ClassName: example
* @author: DingH
* @since: 2019/4/10 10:31
*/
public class example {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local").setAppName("isotonicRegressionExample");
JavaSparkContext jsc = new JavaSparkContext(conf); String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_isotonic_regression_data.txt";
JavaRDD<String> data = jsc.textFile(path); JavaRDD<Tuple3<Double, Double, Double>> parsedData = data.map(new Function<String, Tuple3<Double, Double, Double>>() {
public Tuple3<Double, Double, Double> call(String s) throws Exception {
String[] strings = s.split(",");
return new Tuple3<Double, Double, Double>(Double.parseDouble(strings[0]), Double.parseDouble(strings[1]), 1.0);
}
}); JavaRDD<Tuple3<Double, Double, Double>>[] split = parsedData.randomSplit(new double[]{0.7, 0.3}, 1234L);
JavaRDD<Tuple3<Double, Double, Double>> train = split[0];
JavaRDD<Tuple3<Double, Double, Double>> test = split[1]; final IsotonicRegressionModel model = new IsotonicRegression().setIsotonic(true).run(train); JavaRDD<Tuple2<Double, Double>> preditionAndLabel = test.map(new Function<Tuple3<Double, Double, Double>, Tuple2<Double, Double>>() {
public Tuple2<Double, Double> call(Tuple3<Double, Double, Double> doubleDoubleDoubleTuple3) throws Exception {
double predict = model.predict(doubleDoubleDoubleTuple3._1());
return new Tuple2<Double, Double>(predict, doubleDoubleDoubleTuple3._2());
}
}); Double MSE = new JavaDoubleRDD(preditionAndLabel.map(new Function<Tuple2<Double, Double>, Object>() {
public Object call(Tuple2<Double, Double> doubleDoubleTuple2) throws Exception { return Math.pow(doubleDoubleTuple2._1() - doubleDoubleTuple2._2(), 2.0);
}
}).rdd()).mean(); System.out.println("Mean Squared Error = " + MSE);
}
}

isotonic regression

      

      

    

spark MLlib Classification and regression 学习的更多相关文章

  1. Spark MLlib Deep Learning Convolution Neural Network (深度学习-卷积神经网络)3.1

    3.Spark MLlib Deep Learning Convolution Neural Network (深度学习-卷积神经网络)3.1 http://blog.csdn.net/sunbow0 ...

  2. Spark MLlib Deep Learning Deep Belief Network (深度学习-深度信念网络)2.1

    Spark MLlib Deep Learning Deep Belief Network (深度学习-深度信念网络)2.1 http://blog.csdn.net/sunbow0 Spark ML ...

  3. Spark入门实战系列--8.Spark MLlib(上)--机器学习及SparkMLlib简介

    [注]该系列文章以及使用到安装包/测试数据 可以在<倾情大奉送--Spark入门实战系列>获取 .机器学习概念 1.1 机器学习的定义 在维基百科上对机器学习提出以下几种定义: l“机器学 ...

  4. Spark MLlib 机器学习

    本章导读 机器学习(machine learning, ML)是一门涉及概率论.统计学.逼近论.凸分析.算法复杂度理论等多领域的交叉学科.ML专注于研究计算机模拟或实现人类的学习行为,以获取新知识.新 ...

  5. Spark MLlib架构解析(含分类算法、回归算法、聚类算法和协同过滤)

    Spark MLlib架构解析 MLlib的底层基础解析 MLlib的算法库分析 分类算法 回归算法 聚类算法 协同过滤 MLlib的实用程序分析 从架构图可以看出MLlib主要包含三个部分: 底层基 ...

  6. spark Mllib基本功系列编程入门之 SVM实现分类

    话不多说.直接上代码咯.欢迎交流. /** * Created by whuscalaman on 1/7/16. */import org.apache.spark.{SparkConf, Spar ...

  7. spark Mllib SVM实例

    Mllib SVM实例 1.数据 数据格式为:标签, 特征1 特征2 特征3…… 0 128:51 129:159 130:253 131:159 132:50 155:48 156:238 157: ...

  8. spark MLlib实现的基于朴素贝叶斯(NaiveBayes)的中文文本自动分类

    1.自动文本分类是对大量的非结构化的文字信息(文本文档.网页等)按照给定的分类体系,根据文字信息内容分到指定的类别中去,是一种有指导的学习过程. 分类过程采用基于统计的方法和向量空间模型可以对常见的文 ...

  9. 在Java Web中使用Spark MLlib训练的模型

    PMML是一种通用的配置文件,只要遵循标准的配置文件,就可以在Spark中训练机器学习模型,然后再web接口端去使用.目前应用最广的就是基于Jpmml来加载模型在javaweb中应用,这样就可以实现跨 ...

随机推荐

  1. 金融量化之Tushare模块

    一.介绍 Tushare是一个免费.开源的python财经数据接口包.主要实现对股票等金融数据从数据采集.清洗加工 到 数据存储的过程,能够为金融分析人员提供快速.整洁.和多样的便于分析的数据,为他们 ...

  2. Codeforces Global Round 2

    A:答案一定包含首位或末位. #include<iostream> #include<cstdio> #include<cmath> #include<cst ...

  3. docker-跨主机存储

    容器分类 从业务数据的角度看,容器可以分为两类:无状态(stateless)容器和有状态(stateful)容器. 无状态是指容器在运行过程中不需要保存数据,每次访问的结果不依赖上一次访问,比如提供静 ...

  4. Python【第四篇】函数、内置函数、递归、装饰器、生成器和迭代器

    一.函数 函数是指将一组语句的集合通过一个名字(函数名)封装起来,要想执行这个函数,只需调用其函数名即可 特性: 减少重复代码 使程序变的可扩展 使程序变得易维护 1.定义 def 函数名(参数): ...

  5. shell 基础(二)变量

    1. shell变量的定义 1)Shell 支持以下三种定义变量的方式: variable=value variable='value' variable="value" 特点 1 ...

  6. 【译】1. Java反射——引言

    原文地址:http://tutorials.jenkov.com/java-reflection/index.html *By Jakob Jenkov Java的反射机制使得它可以在运行时检查类.接 ...

  7. OS + macOS Mojave 10.14.4 / sushi / ssh-keygen / ssh-copy-id

    s 系统版本: macOS 10.14.4 (18E226) 内核版本: Darwin 18.5.0 型号名称: Mac mini 2014 型号标识符: Macmini7,1 处理器名称: Inte ...

  8. linux,pthread(转)

    互斥量.条件变量与pthread_cond_wait()函数的使用,详解(二)   1.Linux“线程” 进程与线程之间是有区别的,不过linux内核只提供了轻量进程的支持,未实现线程模型.Linu ...

  9. CentOS7部署Dotnet Core2.1

    前言 笔者在毫无Linux部署.net core的经验下,第一次用了15分钟完成部署,第二次在生产环境用了5分钟.下文将说明如何在CentOS7下完成.NetCore2.1的部署,包括如何创建ASP. ...

  10. INI配置文件的格式

    为什么要用INI文件?如果我们程序没有任何配置文件时,这样的程序对外是全封闭的,一旦程序需要修改一些参数必须要修改程序代码本身并重新编译,这样很不好,所以要用配置文件,让程序出厂后还能根据需要进行必要 ...