spark1.4的本地模式编程练习(1)
spark编程练习
声明:以下代码仅供学习参考,请勿用于商业用途。
- Wordcount
- UserMining
- TweetMining
- HashtagMining
- InvertedIndex
- Test
Test代码
package tutorial;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
/**
 * Smoke test for a Spark installation: distributes a small in-memory list as an RDD and
 * prints the number of elements it contains.
 */
public class Test {
    /**
     * Builds a context against the master {@code spark://master:7077}, parallelizes the
     * integers 1..5, prints the element count, and stops the context.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("test").setMaster("spark://master:7077");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // If the job ever needs application classes on the executors, register the jar
            // first, e.g. sc.addJar("/home/sun/jars/myjar.jar").
            List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
            JavaRDD<Integer> distData = sc.parallelize(data);
            System.out.println(distData.count());
        } finally {
            // Stop the context even when the job fails so it is not left running.
            sc.stop();
        }
    }
}
两个Utils方法
1.解析json的代码
package utils;
import java.io.IOException;
import com.fasterxml.jackson.databind.ObjectMapper;
/**
 * JSON helper that converts one line of tweet JSON into a {@link Tweet}.
 */
public class Parse {
  /**
   * Shared mapper. The original code built a new ObjectMapper for every parsed line;
   * a single instance is reused here instead (Jackson documents configured mappers as
   * thread-safe).
   */
  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

  /**
   * Parses a single JSON document into a {@link Tweet}.
   *
   * @param jsonLine the JSON text of one tweet
   * @return the parsed tweet, or {@code null} when reading fails with an {@link IOException}
   */
  public static Tweet parseJsonToTweet(String jsonLine) {
    try {
      return OBJECT_MAPPER.readValue(jsonLine, Tweet.class);
    } catch (IOException e) {
      // Best effort: report the bad record and let the caller deal with the null result.
      e.printStackTrace();
      return null;
    }
  }
}
2.实体类
package utils;
import java.io.Serializable;
/**
 * Plain data holder for one tweet record.
 *
 * <p>Fields are populated through the setters and read back through the getters.
 */
public class Tweet implements Serializable {
  /** Explicit serialization version so the serialized form does not depend on compiler details. */
  private static final long serialVersionUID = 1L;

  long id;
  String user;
  String userName;
  String text;
  String place;
  String country;
  String lang;

  /** @return the user name, or {@code null} if it was never set */
  public String getUserName() {
    return userName;
  }

  /** @return the language, or {@code null} if it was never set */
  public String getLang() {
    return lang;
  }

  /** @return the tweet id ({@code 0} if it was never set) */
  public long getId() {
    return id;
  }

  /** @return the user, or {@code null} if it was never set */
  public String getUser() {
    return user;
  }

  /** @return the tweet text, or {@code null} if it was never set */
  public String getText() {
    return text;
  }

  /** @return the place, or {@code null} if it was never set */
  public String getPlace() {
    return place;
  }

  /** @return the country, or {@code null} if it was never set */
  public String getCountry() {
    return country;
  }

  /** @param id the tweet id */
  public void setId(long id) {
    this.id = id;
  }

  /** @param user the user */
  public void setUser(String user) {
    this.user = user;
  }

  /** @param userName the user name */
  public void setUserName(String userName) {
    this.userName = userName;
  }

  /** @param text the tweet text */
  public void setText(String text) {
    this.text = text;
  }

  /** @param place the place */
  public void setPlace(String place) {
    this.place = place;
  }

  /** @param country the country */
  public void setCountry(String country) {
    this.country = country;
  }

  /** @param lang the language */
  public void setLang(String lang) {
    this.lang = lang;
  }

  /**
   * Renders id, user, text, place and country separated by {@code ", "}.
   * The user name and language are not part of the output.
   */
  @Override
  public String toString() {
    return getId() + ", " + getUser() + ", " + getText() + ", " + getPlace() + ", " + getCountry();
  }
}
数据文件 reduced-tweets.json 以及测试代码的获取请点击这里。
WordCount代码块
package tutorial;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import java.util.Arrays;
/*
 *  step 1, the mapper:
 *
 *  -我们为每一个单词添加属性 1.获取形如(word,1)的 JavaPairRDD<String, Integer>。单词作为key
 *
 *  step 2, the reducer:
 *  -合并统计.
 *
 *
 */
public class Wordcount {
  private static String pathToFile = "data/wordcount.txt";
  public JavaRDD<String> loadData() {
    SparkConf conf = new SparkConf()
        .setAppName("Wordcount")
        .set("spark.driver.allowMultipleContexts", "true")
        .setMaster("local[*]"); // here local mode. And * means you will use as much as you have cores.
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> words = sc.textFile(pathToFile).flatMap(new FlatMapFunction<String, String>(){
        public Iterable call(String line) throws Exception {
            return Arrays.asList( line.split(" ")) ;
        }
    });
    return words;
  }
  /**
   *  Now count how much each word appears!
   */
  public JavaPairRDD<String, Integer> wordcount() {
    JavaRDD<String> words = loadData();
    // code here
    JavaPairRDD<String, Integer> couples = null;
    // code here
    JavaPairRDD<String, Integer> result = null;
    return result;
  }
  /**
   *  Now keep the word which appear strictly more than 4 times!
   */
  public JavaPairRDD<String, Integer> filterOnWordcount() {
    JavaPairRDD<String, Integer> wordcounts = wordcount();
    // TODO write code here
    JavaPairRDD<String, Integer> filtered = null;
    return filtered;
  }
}
UserMining代码块
package tutorial;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import utils.Parse;
import utils.Tweet;
/**
 * The Java Spark API documentation:
 * http://spark.apache.org/docs/latest/api/java/index.html
 *
 * 我们使用包含了8198个tweet数据记录。数据格式如下:
 *
 * {"id":"572692378957430785", "user":"Srkian_nishu :)", "text":
 * "@always_nidhi @YouTube no i dnt understand bt i loved of this mve is rocking"
 * , "place":"Orissa", "country":"India"}
 *
 * 目标:   找出user所有的tweet账户(一个user可能包含多个tweet账户,如Srkian_nishu的tweet账户有[572692378957430785,...])
 *
 */
public class UserMining {
    private static String pathToFile = "data/reduced-tweets.json";
    public JavaRDD<Tweet> loadData() {
        // Create spark configuration and spark context
        SparkConf conf = new SparkConf().setAppName("User mining").set("spark.driver.allowMultipleContexts", "true")
                .setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Load the data and parse it into a Tweet.
        // Look at the Tweet Object in the TweetUtils class.
        JavaRDD<Tweet> tweets = sc.textFile(pathToFile).map(new Function<String, Tweet>() {
            public Tweet call(String line) throws Exception {
                // TODO Auto-generated method stub
                return Parse.parseJsonToTweet(line);
            }
        });
        return tweets;
    }
    /**
     * For each user return all his tweets
     */
    public JavaPairRDD<String, Iterable<Tweet>> tweetsByUser() {
        JavaRDD<Tweet> tweets = loadData();
        // TODO write code here
        // Hint: the Spark API provides a groupBy method
        JavaPairRDD<String, Iterable<Tweet>> tweetsByUser = null;
        return tweetsByUser;
    }
    /**
     * Compute the number of tweets by user
     */
    public JavaPairRDD<String, Integer> tweetByUserNumber() {
        JavaRDD<Tweet> tweets = loadData();
        // TODO write code here
        // Hint: think about what you did in the wordcount example
        JavaPairRDD<String, Integer> count = null;
        return count;
    }
}
TweetMining代码块
package tutorial;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import utils.Parse;
import utils.Tweet;
import java.io.Serializable;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * The Java Spark API documentation:
 * http://spark.apache.org/docs/latest/api/java/index.html
 ** 我们使用包含了8198个tweet数据记录。数据格式如下:
 *
 * {"id":"572692378957430785", "user":"Srkian_nishu :)", "text":
 * "@always_nidhi @YouTube no i dnt understand bt i loved of this mve is rocking"
 * , "place":"Orissa", "country":"India"}
 *
 * 目标: 1.找出所有被@的人
 *      2.计算每个人被@到的次数,找出前10个@次数最多的人
 *
 *
 * Use the TweetMiningTest to implement the code.
 */
public class TweetMining implements Serializable {
    /**
     *
     */
    private static String pathToFile = "data/reduced-tweets.json";
    /**
     * Load the data from the json file and return an RDD of Tweet
     */
    public JavaRDD<Tweet> loadData() {
        // create spark configuration and spark context
        SparkConf conf = new SparkConf().setAppName("Tweet mining").setMaster("spark://master:7077");
        conf.set("spark.driver.allowMultipleContexts" ,"true");
        JavaSparkContext sc = new JavaSparkContext(conf);
        sc.addJar("/home/sun/jars/tutorial-all.jar");
        // load the data and create an RDD of Tweet
        JavaRDD<Tweet> tweets = sc.textFile("hdfs://master:9000/sparkdata/reduced-tweets.json")
                .map(new Function<String, Tweet>() {
                    public Tweet call(String line) throws Exception {
                        // TODO Auto-generated method stub
                        return Parse.parseJsonToTweet(line);
                    }
                });
        return tweets;
    }
    /**
     * Find all the persons mentioned on tweets (case sensitive)
     */
    public JavaRDD<String> mentionOnTweet() {
        JavaRDD<Tweet> tweets = loadData();
        // You want to return an RDD with the mentions
        // Hint: think about separating the word in the text field and then find
        // the mentions
        // TODO write code here
        JavaRDD<String> mentions = tweets.flatMap(new FlatMapFunction<Tweet, String>() {
            public Iterable<String> call(Tweet t) throws Exception {
                String text = t.getText();
                Set<String> set = new HashSet<String>();
                String[] words = text.split(" ");
                for (String word : words) {
                    if (word.startsWith("@")) {
                        set.add(word);
                    }
                }
                return set;
            }
        });
        return mentions;
    }
    /**
     * Count how many times each person is mentioned
     */
    public JavaPairRDD<String, Integer> countMentions() {
        JavaRDD<String> mentions = mentionOnTweet();
        // Hint: think about what you did in the wordcount example
        // TODO write code here
        JavaPairRDD<String, Integer> mentionCount = mentions.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String t) throws Exception {
                return new Tuple2<String, Integer>(t, 1);
            }
        }).reduceByKey(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer v1, Integer v2) throws Exception {
                // TODO Auto-generated method stub
                return v1 + v2;
            }
        });
        mentionCount.saveAsTextFile("hdfs://master:9000/sparkdata/tweets-m4");
        return mentionCount;
    }
    /**
     * Find the 10 most mentioned persons by descending order
     */
    public List<Tuple2<Integer, String>> top10mentions() {
        JavaPairRDD<String, Integer> counts = countMentions();
        // Hint: take a look at the sorting and take methods
        // TODO write code here
        List<Tuple2<Integer, String>> mostMentioned = null;
        return mostMentioned;
    }
    public static void main(String[] args) {
        Ex2TweetMining ex2TweetMining = new Ex2TweetMining();
        JavaPairRDD<String, Integer> res = ex2TweetMining.countMentions();
        System.out.println(res.take(1));
    }
}
HashtagMining代码块
package tutorial;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import scala.Tuple2;
import utils.Parse;
import utils.Tweet;
import java.util.List;
/**
 *  The Java Spark API documentation: http://spark.apache.org/docs/latest/api/java/index.html
 *
 * 我们使用包含了8198个tweet数据记录。数据格式如下:
 *
 * {"id":"572692378957430785", "user":"Srkian_nishu :)", "text":
 * "@always_nidhi @YouTube no i dnt understand bt i loved of this mve is rocking"
 * , "place":"Orissa", "country":"India"}
 *
 * 目标: 1.找出所有所有被标记(”#“)到的人。
 *      2.找出每个被标记(“#”)的人被(”@“)到的次数,求出次数前十
 *
 *
 */
public class HashtagMining {
  private static String pathToFile = "data/reduced-tweets.json";
  /**
   *  Load the data from the json file and return an RDD of Tweet
   */
  public JavaRDD<Tweet> loadData() {
    // create spark configuration and spark context
    SparkConf conf = new SparkConf()
        .setAppName("Hashtag mining")
        .set("spark.driver.allowMultipleContexts", "true")
        .setMaster("local[*]");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<Tweet> tweets = sc.textFile(pathToFile).map(new Function<String, Tweet>() {
        public Tweet call(String line) throws Exception {
            // TODO Auto-generated method stub
            return Parse.parseJsonToTweet(line);
        }
    });
    return tweets;
  }
  /**
   *  Find all the hashtags mentioned on tweets
   */
  public JavaRDD<String> hashtagMentionedOnTweet() {
    JavaRDD<Tweet> tweets = loadData();
    // You want to return an RDD with the mentions
    // Hint: think about separating the word in the text field and then find the mentions
    // TODO write code here
    JavaRDD<String> mentions = null;
    return mentions;
  }
  /**
   *  Count how many times each hashtag is mentioned
   */
  public JavaPairRDD<String,Integer> countMentions() {
    JavaRDD<String> mentions = hashtagMentionedOnTweet();
    // Hint: think about what you did in the wordcount example
    // TODO write code here
    JavaPairRDD<String, Integer> counts = null;
    return counts;
  }
  /**
   *  Find the 10 most popular Hashtags by descending order
   */
  public List<Tuple2<Integer, String>> top10HashTags() {
    JavaPairRDD<String, Integer> counts = countMentions();
    // Hint: take a look at the sorting and take methods
    // TODO write code here
    List<Tuple2<Integer, String>> top10 = null;
    return top10;
  }
}
InvertedIndex代码块
package tutorial;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import utils.Parse;
import utils.Tweet;
import java.util.Map;
/**
 * 目标 : 建立标记的索引视图
 *
 * 说明:  例如对于标记#spark,它出现在tweet1, tweet3, tweet39中。 建立的索引应该返回(#spark, List(tweet1,tweet3, tweet39))
 *
 */
public class InvertedIndex {
  private static String pathToFile = "data/reduced-tweets.json";
  /**
   *  Load the data from the json file and return an RDD of Tweet
   */
  public JavaRDD<Tweet> loadData() {
    // create spark configuration and spark context
    SparkConf conf = new SparkConf()
        .setAppName("Inverted index")
        .set("spark.driver.allowMultipleContexts", "true")
        .setMaster("local[*]");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<Tweet> tweets = sc.textFile(pathToFile).map(new Function<String, Tweet>() {
        public Tweet call(String line) throws Exception {
            // TODO Auto-generated method stub
            return Parse.parseJsonToTweet(line);
        }
    });
    return tweets;
  }
  public Map<String, Iterable<Tweet>> invertedIndex() {
    JavaRDD<Tweet> tweets = loadData();
    // for each tweet, extract all the hashtag and then create couples (hashtag,tweet)
    // Hint: see the flatMapToPair method
    // TODO write code here
    JavaPairRDD<String, Tweet> pairs = null;
    // We want to group the tweets by hashtag
    // TODO write code here
    JavaPairRDD<String, Iterable<Tweet>> tweetsByHashtag = null;
    // Then return the inverted index (= a map structure)
    // TODO write code here
    Map<String, Iterable<Tweet>> map = null;
    return map;
  }
}spark1.4的本地模式编程练习(1)的更多相关文章
- Eclipse的下载、安装和WordCount的初步使用(本地模式和集群模式)
		包括: Eclipse的下载 Eclipse的安装 Eclipse的使用 本地模式或集群模式 Scala IDE for Eclipse的下载.安装和WordCount的初步使用(本地模式和集群 ... 
- IntelliJ IDEA的下载、安装和WordCount的初步使用(本地模式和集群模式)
		包括: IntelliJ IDEA的下载 IntelliJ IDEA的安装 IntelliJ IDEA中的scala插件安装 用SBT方式来创建工程 或 选择Scala方式来创建工程 本地模式或集群 ... 
- IntelliJ IDEA(Community版本)的下载、安装和WordCount的初步使用(本地模式和集群模式)
		不多说,直接上干货! 对于初学者来说,建议你先玩玩这个免费的社区版,但是,一段时间,还是去玩专业版吧,这个很简单哈,学聪明点,去搞到途径激活!可以看我的博客. 包括: IntelliJ IDEA(Co ... 
- Scala IDE for Eclipse的下载、安装和WordCount的初步使用(本地模式和集群模式)
		包括: Scala IDE for Eclipse的下载 Scala IDE for Eclipse的安装 本地模式或集群模式 我们知道,对于开发而言,IDE是有很多个选择的版本.如我们大部分人经常 ... 
- IntelliJ IDEA(Ultimate版本)的下载、安装和WordCount的初步使用(本地模式和集群模式)
		不多说,直接上干货! IntelliJ IDEA号称当前Java开发效率最高的IDE工具.IntelliJ IDEA有两个版本:社区版(Community)和旗舰版(Ultimate).社区版时免费的 ... 
- 大数据入门第八天——MapReduce详解(四)本地模式运行与join实例
		一.本地模式调试MR程序 1.准备 参考之前随笔的windows开发说明处:http://www.cnblogs.com/jiangbei/p/8366238.html 2.流程 最重要的是设置Loc ... 
- 八、hive3.1.2 安装及其配置(本地模式和远程模式)
		目录 前文 hive3.1.2 安装及其配置(本地模式和远程模式) 1.下载hive包 2.修改其环境变量 3.MySQL配置 Centos7 MySQL安装步骤: 1.设置MySQL源 2.安装My ... 
- Hive的三种安装方式(内嵌模式,本地模式远程模式)
		一.安装模式介绍: Hive官网上介绍了Hive的3种安装方式,分别对应不同的应用场景. 1.内嵌模式(元数据保村在内嵌的derby种,允许一个会话链接,尝试多个会话链接时会报错) ... 
- Windows下nodejs 模块配置 全局模式与本地模式的区别
		第1步:下载.安装文件 (nodejs的官网http://www.nodejs.org/download/ ) 第2步:安装相关模块环境 打开C:\Program Files\nodejs 目录你会发 ... 
随机推荐
- android的Looper例子
			直接贴代码 MsgThread.java package bb.aa.looperdemo; import android.os.Handler; import android.os.Looper; ... 
- jsp页面中的问题:Date cannot be resolved to a type
			问题如下:写了一个jsp,提示 症状原因:缺date的jar包 解决办法:在jsp开头导入jar包:<%@ page language="java" import=" ... 
- 打开SDK Manager检查Android SDK下载和更新失败的解决方法
			[故障描述] 打开SDK Manager检查Android SDK状况,出现以下情况: Failed to fetch URL https://dl-ssl.google.com/android/r ... 
- [git]Git常用命令
			转自:http://www.cnblogs.com/idche/archive/2011/07/05/2098165.htmlGIT 学习笔记 集中化的版本控制系统 CVCS(Centralized ... 
- eclipse 每次切换工作空间都要重新配置
			首先,导出T1中的配置打开T1,选择file --> Export --> 在弹出框中选择General 下的preference --> next --> 在export p ... 
- 单元测试(junit使用)
			1.测试的对象是一个类中的方法. 2.导入jar包. 3.单元测试方法时候,测试方法命名规则为 public void 方法名(){},注意:测试类不能命名为public class Test{},T ... 
- 探秘JavaScript中的六个字符
			JavaScript 是一个奇怪而有趣的语言,我们可以写一些疯狂却仍然有效的代码.它试图帮助我们把事情转换到基于我们如何对待他们的特定类型. 如果我们添加一个字符串,JavaScript会假定我们希望 ... 
- MapReduce编程实现学习
			MapReduce主要包括两个阶段:一个是Map,一个是Reduce. 每一步都有key-value对作为输入和输出. Map阶段的key-value对的格式是由输入的格式决定的,如果是默认的Text ... 
- PLSQL_查询SQL的执行次数和频率(案例)
			2014-12-25 Created By BaoXinjian 
- DBA_Oracle Erp版本升级12.1.1到R12.1.3(案例)
			20150506 Created By BaoXinjian 
