spark编程练习

声明:以下代码仅供学习参考,请勿用于商业用途。

  • Wordcount
  • UserMining
  • TweetMining
  • HashtagMining
  • InvertedIndex
  • Test

Test代码

package tutorial;

import java.util.Arrays;
import java.util.List; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

/**
 * Minimal smoke test: connects to the master at {@code spark://master:7077},
 * distributes a five-element list and prints how many elements the RDD holds.
 */
public class Test {

  /**
   * Runs the smoke test.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("test").setMaster("spark://master:7077");
    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
      // sc.addJar("/home/sun/jars/myjar.jar");
      List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
      JavaRDD<Integer> distData = sc.parallelize(data);
      System.out.println(distData.count());
    } finally {
      // Stop the context even when the job fails instead of suppressing the
      // resource-leak warning and leaving the context open.
      sc.stop();
    }
  }
}

两个Utils类

1.解析json的代码

package utils;

import java.io.IOException;

import com.fasterxml.jackson.databind.ObjectMapper;

/**
 * JSON helper that turns one line of the tweet data file into a {@link Tweet}.
 */
public class Parse {

  /**
   * Shared mapper. It is created once instead of once per parsed line, because
   * this method is called for every record of the input file.
   */
  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

  /**
   * Deserializes a single JSON document into a {@link Tweet}.
   *
   * @param jsonLine one JSON object, as text
   * @return the parsed tweet, or {@code null} when Jackson reports an
   *         {@link IOException} (the stack trace is printed in that case)
   */
  public static Tweet parseJsonToTweet(String jsonLine) {
    Tweet tweet = null;
    try {
      tweet = OBJECT_MAPPER.readValue(jsonLine, Tweet.class);
    } catch (IOException e) {
      // Best effort: report the bad line and let the caller see null.
      e.printStackTrace();
    }
    return tweet;
  }
}

2.实体类

package utils;

import java.io.Serializable;

/**
 * Plain serializable bean describing one tweet record.
 *
 * <p>Each field has a getter and a setter. {@link #toString()} prints only the
 * id, user, text, place and country; {@code userName} and {@code lang} are not
 * part of it.
 */
public class Tweet implements Serializable {

  long id;
  String user;
  String userName;
  String text;
  String place;
  String country;
  String lang;

  // --- id ---

  public long getId() {
    return id;
  }

  public void setId(long id) {
    this.id = id;
  }

  // --- user ---

  public String getUser() {
    return user;
  }

  public void setUser(String user) {
    this.user = user;
  }

  // --- userName ---

  public String getUserName() {
    return userName;
  }

  public void setUserName(String userName) {
    this.userName = userName;
  }

  // --- text ---

  public String getText() {
    return text;
  }

  public void setText(String text) {
    this.text = text;
  }

  // --- place ---

  public String getPlace() {
    return place;
  }

  public void setPlace(String place) {
    this.place = place;
  }

  // --- country ---

  public String getCountry() {
    return country;
  }

  public void setCountry(String country) {
    this.country = country;
  }

  // --- lang ---

  public String getLang() {
    return lang;
  }

  public void setLang(String lang) {
    this.lang = lang;
  }

  /**
   * Renders the tweet as {@code id, user, text, place, country}.
   * Unset string fields show up as the text {@code null}.
   */
  @Override
  public String toString() {
    final String separator = ", ";
    StringBuilder out = new StringBuilder();
    out.append(getId()).append(separator);
    out.append(getUser()).append(separator);
    out.append(getText()).append(separator);
    out.append(getPlace()).append(separator);
    out.append(getCountry());
    return out.toString();
  }
}

reduced-tweets.json

数据以及测试代码的获取请点击 这里。

WordCount代码块

package tutorial;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction; import java.util.Arrays; /*
* step 1, the mapper:
*
* -我们为每一个单词添加属性 1.获取形如(word,1)的 JavaPairRDD<String, Integer>。单词作为key
*
* step 2, the reducer:
* -合并统计.
*
*
*/
public class Wordcount { private static String pathToFile = "data/wordcount.txt"; public JavaRDD<String> loadData() {
SparkConf conf = new SparkConf()
.setAppName("Wordcount")
.set("spark.driver.allowMultipleContexts", "true")
.setMaster("local[*]"); // here local mode. And * means you will use as much as you have cores. JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> words = sc.textFile(pathToFile).flatMap(new FlatMapFunction<String, String>(){
public Iterable call(String line) throws Exception {
return Arrays.asList( line.split(" ")) ;
}
}); return words; } /**
* Now count how much each word appears!
*/
public JavaPairRDD<String, Integer> wordcount() {
JavaRDD<String> words = loadData(); // code here
JavaPairRDD<String, Integer> couples = null; // code here
JavaPairRDD<String, Integer> result = null; return result;
} /**
* Now keep the word which appear strictly more than 4 times!
*/
public JavaPairRDD<String, Integer> filterOnWordcount() {
JavaPairRDD<String, Integer> wordcounts = wordcount(); // TODO write code here
JavaPairRDD<String, Integer> filtered = null; return filtered; } }

UserMining代码块

package tutorial;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function; import utils.Parse;
import utils.Tweet; /**
* The Java Spark API documentation:
* http://spark.apache.org/docs/latest/api/java/index.html
*
* 我们使用包含了8198个tweet数据记录。数据格式如下:
*
* {"id":"572692378957430785", "user":"Srkian_nishu :)", "text":
* "@always_nidhi @YouTube no i dnt understand bt i loved of this mve is rocking"
* , "place":"Orissa", "country":"India"}
*
* 目标: 找出user所有的tweet账户(一个user可能包含多个tweet账户,如Srkian_nishu的tweet账户有[572692378957430785,...])
*
*/
public class UserMining { private static String pathToFile = "data/reduced-tweets.json"; public JavaRDD<Tweet> loadData() {
// Create spark configuration and spark context
SparkConf conf = new SparkConf().setAppName("User mining").set("spark.driver.allowMultipleContexts", "true")
.setMaster("local[*]"); JavaSparkContext sc = new JavaSparkContext(conf); // Load the data and parse it into a Tweet.
// Look at the Tweet Object in the TweetUtils class.
JavaRDD<Tweet> tweets = sc.textFile(pathToFile).map(new Function<String, Tweet>() {
public Tweet call(String line) throws Exception {
// TODO Auto-generated method stub
return Parse.parseJsonToTweet(line);
} }); return tweets;
} /**
* For each user return all his tweets
*/
public JavaPairRDD<String, Iterable<Tweet>> tweetsByUser() {
JavaRDD<Tweet> tweets = loadData(); // TODO write code here
// Hint: the Spark API provides a groupBy method
JavaPairRDD<String, Iterable<Tweet>> tweetsByUser = null; return tweetsByUser;
} /**
* Compute the number of tweets by user
*/
public JavaPairRDD<String, Integer> tweetByUserNumber() {
JavaRDD<Tweet> tweets = loadData(); // TODO write code here
// Hint: think about what you did in the wordcount example
JavaPairRDD<String, Integer> count = null; return count;
} }

TweetMining代码块

package tutorial;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction; import scala.Tuple2;
import utils.Parse;
import utils.Tweet; import java.io.Serializable;
import java.util.HashSet;
import java.util.List;
import java.util.Set; /**
* The Java Spark API documentation:
* http://spark.apache.org/docs/latest/api/java/index.html
** 我们使用包含了8198个tweet数据记录。数据格式如下:
*
* {"id":"572692378957430785", "user":"Srkian_nishu :)", "text":
* "@always_nidhi @YouTube no i dnt understand bt i loved of this mve is rocking"
* , "place":"Orissa", "country":"India"}
*
* 目标: 1.找出所有被@的人
* 2.计算每个人被@到的次数,找出前10个@次数最多的人
*
*
* Use the TweetMiningTest to implement the code.
*/
public class TweetMining implements Serializable { /**
*
*/ private static String pathToFile = "data/reduced-tweets.json"; /**
* Load the data from the json file and return an RDD of Tweet
*/
public JavaRDD<Tweet> loadData() {
// create spark configuration and spark context
SparkConf conf = new SparkConf().setAppName("Tweet mining").setMaster("spark://master:7077");
conf.set("spark.driver.allowMultipleContexts" ,"true");
JavaSparkContext sc = new JavaSparkContext(conf);
sc.addJar("/home/sun/jars/tutorial-all.jar"); // load the data and create an RDD of Tweet
JavaRDD<Tweet> tweets = sc.textFile("hdfs://master:9000/sparkdata/reduced-tweets.json")
.map(new Function<String, Tweet>() {
public Tweet call(String line) throws Exception {
// TODO Auto-generated method stub
return Parse.parseJsonToTweet(line);
} });
return tweets;
} /**
* Find all the persons mentioned on tweets (case sensitive)
*/
public JavaRDD<String> mentionOnTweet() {
JavaRDD<Tweet> tweets = loadData(); // You want to return an RDD with the mentions
// Hint: think about separating the word in the text field and then find
// the mentions
// TODO write code here
JavaRDD<String> mentions = tweets.flatMap(new FlatMapFunction<Tweet, String>() {
public Iterable<String> call(Tweet t) throws Exception {
String text = t.getText();
Set<String> set = new HashSet<String>();
String[] words = text.split(" ");
for (String word : words) {
if (word.startsWith("@")) {
set.add(word);
}
}
return set;
} }); return mentions; } /**
* Count how many times each person is mentioned
*/
public JavaPairRDD<String, Integer> countMentions() {
JavaRDD<String> mentions = mentionOnTweet(); // Hint: think about what you did in the wordcount example
// TODO write code here
JavaPairRDD<String, Integer> mentionCount = mentions.mapToPair(new PairFunction<String, String, Integer>() {
public Tuple2<String, Integer> call(String t) throws Exception {
return new Tuple2<String, Integer>(t, 1);
}
}).reduceByKey(new Function2<Integer, Integer, Integer>() {
public Integer call(Integer v1, Integer v2) throws Exception {
// TODO Auto-generated method stub
return v1 + v2;
}
});
mentionCount.saveAsTextFile("hdfs://master:9000/sparkdata/tweets-m4");
return mentionCount;
} /**
* Find the 10 most mentioned persons by descending order
*/
public List<Tuple2<Integer, String>> top10mentions() {
JavaPairRDD<String, Integer> counts = countMentions(); // Hint: take a look at the sorting and take methods
// TODO write code here
List<Tuple2<Integer, String>> mostMentioned = null; return mostMentioned;
} public static void main(String[] args) {
Ex2TweetMining ex2TweetMining = new Ex2TweetMining();
JavaPairRDD<String, Integer> res = ex2TweetMining.countMentions();
System.out.println(res.take(1));
}
}

HashtagMining代码块

package tutorial;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function; import scala.Tuple2;
import utils.Parse;
import utils.Tweet; import java.util.List; /**
* The Java Spark API documentation: http://spark.apache.org/docs/latest/api/java/index.html
*
* 我们使用包含了8198个tweet数据记录。数据格式如下:
*
* {"id":"572692378957430785", "user":"Srkian_nishu :)", "text":
* "@always_nidhi @YouTube no i dnt understand bt i loved of this mve is rocking"
* , "place":"Orissa", "country":"India"}
*
* 目标: 1.找出tweet中所有以“#”标记的话题(hashtag)。
* 2.统计每个hashtag被提到的次数,求出次数最多的前十个
*
*
*/
public class HashtagMining { private static String pathToFile = "data/reduced-tweets.json"; /**
* Load the data from the json file and return an RDD of Tweet
*/
public JavaRDD<Tweet> loadData() {
// create spark configuration and spark context
SparkConf conf = new SparkConf()
.setAppName("Hashtag mining")
.set("spark.driver.allowMultipleContexts", "true")
.setMaster("local[*]"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<Tweet> tweets = sc.textFile(pathToFile).map(new Function<String, Tweet>() {
public Tweet call(String line) throws Exception {
// TODO Auto-generated method stub
return Parse.parseJsonToTweet(line);
} }); return tweets;
} /**
* Find all the hashtags mentioned on tweets
*/
public JavaRDD<String> hashtagMentionedOnTweet() {
JavaRDD<Tweet> tweets = loadData(); // You want to return an RDD with the mentions
// Hint: think about separating the word in the text field and then find the mentions
// TODO write code here
JavaRDD<String> mentions = null; return mentions;
} /**
* Count how many times each hashtag is mentioned
*/
public JavaPairRDD<String,Integer> countMentions() {
JavaRDD<String> mentions = hashtagMentionedOnTweet(); // Hint: think about what you did in the wordcount example
// TODO write code here
JavaPairRDD<String, Integer> counts = null; return counts;
} /**
* Find the 10 most popular Hashtags by descending order
*/
public List<Tuple2<Integer, String>> top10HashTags() {
JavaPairRDD<String, Integer> counts = countMentions(); // Hint: take a look at the sorting and take methods
// TODO write code here
List<Tuple2<Integer, String>> top10 = null; return top10;
} }

InvertedIndex代码块

package tutorial;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function; import utils.Parse;
import utils.Tweet; import java.util.Map; /** * 目标 : 建立标记的索引视图
*
* 说明: 例如对于标记#spark,它出现在tweet1, tweet3, tweet39中。 建立的索引应该返回(#spark, List(tweet1,tweet3, tweet39))
*
*/
public class InvertedIndex { private static String pathToFile = "data/reduced-tweets.json"; /**
* Load the data from the json file and return an RDD of Tweet
*/
public JavaRDD<Tweet> loadData() {
// create spark configuration and spark context
SparkConf conf = new SparkConf()
.setAppName("Inverted index")
.set("spark.driver.allowMultipleContexts", "true")
.setMaster("local[*]"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<Tweet> tweets = sc.textFile(pathToFile).map(new Function<String, Tweet>() {
public Tweet call(String line) throws Exception {
// TODO Auto-generated method stub
return Parse.parseJsonToTweet(line);
} }); return tweets;
} public Map<String, Iterable<Tweet>> invertedIndex() {
JavaRDD<Tweet> tweets = loadData(); // for each tweet, extract all the hashtag and then create couples (hashtag,tweet)
// Hint: see the flatMapToPair method
// TODO write code here
JavaPairRDD<String, Tweet> pairs = null; // We want to group the tweets by hashtag
// TODO write code here
JavaPairRDD<String, Iterable<Tweet>> tweetsByHashtag = null; // Then return the inverted index (= a map structure)
// TODO write code here
Map<String, Iterable<Tweet>> map = null; return map;
} }

spark1.4的本地模式编程练习(1)的更多相关文章

  1. Eclipse的下载、安装和WordCount的初步使用(本地模式和集群模式)

    包括:    Eclipse的下载 Eclipse的安装 Eclipse的使用 本地模式或集群模式 Scala IDE for Eclipse的下载.安装和WordCount的初步使用(本地模式和集群 ...

  2. IntelliJ IDEA的下载、安装和WordCount的初步使用(本地模式和集群模式)

    包括: IntelliJ IDEA的下载  IntelliJ IDEA的安装 IntelliJ IDEA中的scala插件安装 用SBT方式来创建工程 或 选择Scala方式来创建工程 本地模式或集群 ...

  3. IntelliJ IDEA(Community版本)的下载、安装和WordCount的初步使用(本地模式和集群模式)

    不多说,直接上干货! 对于初学者来说,建议你先玩玩这个免费的社区版,但是,一段时间,还是去玩专业版吧,这个很简单哈,学聪明点,去搞到途径激活!可以看我的博客. 包括: IntelliJ IDEA(Co ...

  4. Scala IDE for Eclipse的下载、安装和WordCount的初步使用(本地模式和集群模式)

    包括: Scala IDE for Eclipse的下载  Scala IDE for Eclipse的安装 本地模式或集群模式 我们知道,对于开发而言,IDE是有很多个选择的版本.如我们大部分人经常 ...

  5. IntelliJ IDEA(Ultimate版本)的下载、安装和WordCount的初步使用(本地模式和集群模式)

    不多说,直接上干货! IntelliJ IDEA号称当前Java开发效率最高的IDE工具.IntelliJ IDEA有两个版本:社区版(Community)和旗舰版(Ultimate).社区版时免费的 ...

  6. 大数据入门第八天——MapReduce详解(四)本地模式运行与join实例

    一.本地模式调试MR程序 1.准备 参考之前随笔的windows开发说明处:http://www.cnblogs.com/jiangbei/p/8366238.html 2.流程 最重要的是设置Loc ...

  7. 八、hive3.1.2 安装及其配置(本地模式和远程模式)

    目录 前文 hive3.1.2 安装及其配置(本地模式和远程模式) 1.下载hive包 2.修改其环境变量 3.MySQL配置 Centos7 MySQL安装步骤: 1.设置MySQL源 2.安装My ...

  8. Hive的三种安装方式(内嵌模式,本地模式远程模式)

    一.安装模式介绍:     Hive官网上介绍了Hive的3种安装方式,分别对应不同的应用场景.     1.内嵌模式(元数据保村在内嵌的derby种,允许一个会话链接,尝试多个会话链接时会报错)   ...

  9. Windows下nodejs 模块配置 全局模式与本地模式的区别

    第1步:下载.安装文件 (nodejs的官网http://www.nodejs.org/download/ ) 第2步:安装相关模块环境 打开C:\Program Files\nodejs 目录你会发 ...

随机推荐

  1. 【Java编程进阶-1】enum枚举的使用

    枚举主要用于枚举常量,下面举个简单的应用. 比如一个公司有如下几个部门: 研发部: 销售部: 财务部: (其他部门暂时不列举) 部门的某些信息相对固定,此时可以考虑使用枚举来说明: 枚举类 Depts ...

  2. Maven仓库—Nexus环境搭建及简单介绍

    1.    环境搭建 1.1  下载 http://www.sonatype.org/nexus/ NEXUS OSS [OSS = Open Source Software,开源软件--免费] NE ...

  3. 黄聪:利用OpenXml生成Word2007文档(转)

    原文:http://blog.csdn.net/francislaw/article/details/7568317 版权声明:本文为博主原创文章,未经博主允许不得转载.   目录(?)[-] 一Op ...

  4. PHP转换UTF-8和GB2312的URL编码(转)

    目前WEB的应用中, UTF-8编码和GB2312编码是并存在的,例如百度(baidu.com)和谷歌(google.com)的URL编码分别是GB2312编码和UTF-8编码.由于编码并存引起的乱码 ...

  5. C#学习笔记三: C#2.0泛型 可控类型 匿名方法和迭代器

    前言 C#1.0的委托特性使方法作为其他方法的参数来传递,而C#2.0 中提出的泛型特性则使类型可以被参数化,从而不必再为不同的类型提供特殊版本的实现方法.另外C#2.0还提出了可空类型,匿名方法和迭 ...

  6. python3读取文件

    #coding:utf-8 rfile = open('test.txt','r') str=[] for x in rfile: str = x.split(',') for x in str: p ...

  7. 如何实现一个malloc

    任何一个用过或学过C的人对malloc都不会陌生.大家都知道malloc可以分配一段连续的内存空间,并且在不再使用时可以通过free释放掉.但是,许多程序员对malloc背后的事情并不熟悉,许多人甚至 ...

  8. GL_Oracle Erp月结和年节流程讨论(概念)

    2014-02-06 Created By BaoXinjian

  9. bzoj2005 能量采集 gcd 容斥

    ans = sigma_x(sigma_y(gcd(x,y) * 2 - 1)),1<=x<=n,1<=y<=m 枚举x,y,O(nmlogn),超时 换个角度,枚举d = g ...

  10. web提前做好测试

    1.压力测试,找到极限点和瓶颈,最小化扩容2.消息队列应对高并发的写操作 根据数据大小分成不同队列,保证效率 堵塞队列,压队列机极限处理能力3.主要业务和次要业务分开,当出现异常时保障主要业务,保证系 ...