MapReduce 读写 Parquet 格式数据 Demo

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.GenericOptionsParser;

import org.apache.parquet.example.data.Group;

import org.apache.parquet.example.data.simple.SimpleGroupFactory;

import org.apache.parquet.hadoop.ParquetInputFormat;

import org.apache.parquet.hadoop.ParquetOutputFormat;

import org.apache.parquet.hadoop.example.GroupReadSupport;

import org.apache.parquet.hadoop.example.GroupWriteSupport;

import org.apache.parquet.schema.MessageType;

import org.apache.parquet.schema.OriginalType;

import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;

import org.apache.parquet.schema.Types;

/**
 * Demo driver showing how a MapReduce job can read and write Parquet data.
 *
 * <p>Expected arguments (after generic Hadoop options): {@code <in> <out> <mode>}, where
 * {@code mode} selects the pipeline:
 * <ul>
 *   <li>{@code 1} - text input, text output (mapper + reducer)</li>
 *   <li>{@code 2} - Parquet input, text output (map-only)</li>
 *   <li>{@code 3} - text input, Parquet output (map-only)</li>
 *   <li>{@code 4} - Parquet input, Parquet output (not implemented; rejected with an error)</li>
 * </ul>
 *
 * <p>Text input lines are whitespace-separated and must carry two columns: {@code city ip}.
 * The process exit code is 0 on success, 1 when the job fails and 2 on bad arguments.
 */
public class ParquetReaderAndWriteMRDemo {

    /** Number of whitespace-separated columns (city, ip) a text input line must provide. */
    private static final int EXPECTED_FIELDS = 2;

    /** Counter group/name used to report text lines that were skipped as malformed. */
    private static final String COUNTER_GROUP = "ParquetDemo";
    private static final String MALFORMED_COUNTER = "MALFORMED_LINES";

    /**
     * Parses the arguments, configures the job for the requested mode, runs it and exits
     * with a status that reflects the job result.
     *
     * @param args generic Hadoop options followed by {@code <in> <out> <mode>}
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherargs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherargs.length != 3) {
            printUsage();
            System.exit(2);
        }

        String mode = otherargs[2];
        if ("4".equals(mode)) {
            // Advertised in the usage text but never implemented: fail loudly instead of
            // silently doing nothing and reporting success.
            System.err.println("mode 4 (<parquet-in> <parquet-out>) is not implemented");
            System.exit(2);
        }
        if (!"1".equals(mode) && !"2".equals(mode) && !"3".equals(mode)) {
            System.err.println("unknown mode: " + mode);
            printUsage();
            System.exit(2);
        }

        // This demo works on two string columns: city and ip.
        MessageType schema = Types.buildMessage()
                .required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("city")
                .required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("ip")
                .named("pair");
        System.out.println("[schema]==" + schema.toString());
        // Must be stored in the configuration before the Job copies it.
        GroupWriteSupport.setSchema(schema, conf);

        Job job = Job.getInstance(conf, "ParquetReadMR");
        job.setJarByClass(ParquetReaderAndWriteMRDemo.class);
        FileInputFormat.setInputPaths(job, otherargs[0]);
        FileOutputFormat.setOutputPath(job, new Path(otherargs[1]));

        if ("1".equals(mode)) {
            // Text in, text out.
            job.setMapperClass(NormalMapper.class);
            job.setReducerClass(NormalReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
        } else if ("2".equals(mode)) {
            // Parquet in, text out.
            job.setMapperClass(ParquetReadMapper.class);
            job.setNumReduceTasks(0);
            job.setInputFormatClass(ParquetInputFormat.class);
            ParquetInputFormat.setReadSupportClass(job, GroupReadSupport.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
        } else {
            // Mode 3: text in, Parquet out.
            job.setMapperClass(ParquetWriteMapper.class);
            job.setNumReduceTasks(0);
            job.setOutputFormatClass(ParquetOutputFormat.class);
            ParquetOutputFormat.setWriteSupportClass(job, GroupWriteSupport.class);
        }

        // Propagate job failure to the caller through the exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /** Prints the supported argument combinations to standard error. */
    private static void printUsage() {
        System.err.println("<in> <out> 1");
        System.err.println("<parquet-in> <out> 2");
        System.err.println("<in> <parquet-out> 3");
        System.err.println("<parquet-in> <parquet-out> 4");
    }

    /**
     * Splits a text input line into whitespace-separated columns.
     * Surrounding whitespace is trimmed first so a leading blank does not produce an
     * empty first column.
     *
     * @param line the raw input line
     * @return the columns; a blank line yields a single empty string
     */
    private static String[] splitFields(Text line) {
        return line.toString().trim().split("\\s+");
    }

    /** Map-only mapper that converts {@code city ip} text lines into Parquet groups. */
    public static class ParquetWriteMapper extends Mapper<LongWritable, Text, Void, Group> {

        private SimpleGroupFactory factory = null;

        /** Builds the group factory from the schema stored in the job configuration. */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(context.getConfiguration()));
        }

        /**
         * Emits one group per well-formed line; lines with fewer than two columns are
         * counted and skipped.
         */
        @Override
        public void map(LongWritable _key, Text ivalue, Context context) throws IOException, InterruptedException {
            String[] fields = splitFields(ivalue);
            if (fields.length < EXPECTED_FIELDS) {
                context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                return;
            }
            Group pair = factory.newGroup();
            pair.append("city", fields[0]);
            pair.append("ip", fields[1]);
            // The key is null: only the group value is written.
            context.write(null, pair);
        }

    }

    /** Map-only mapper that reads Parquet groups and emits the first two columns as text. */
    public static class ParquetReadMapper extends Mapper<Void, Group, Text, Text> {

        /** Emits column 0 as the key and column 1 as the value (first value of each field). */
        @Override
        public void map(Void _key, Group group, Context context) throws IOException, InterruptedException {
            String city = group.getString(0, 0);
            String ip = group.getString(1, 0);
            context.write(new Text(city), new Text(ip));
        }

    }

    /** Mapper for plain text input: emits column 0 as the key and column 1 as the value. */
    public static class NormalMapper extends Mapper<LongWritable, Text, Text, Text> {

        /** Lines with fewer than two columns are counted and skipped. */
        @Override
        public void map(LongWritable ikey, Text ivalue, Context context) throws IOException, InterruptedException {
            String[] fields = splitFields(ivalue);
            if (fields.length < EXPECTED_FIELDS) {
                context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                return;
            }
            context.write(new Text(fields[0]), new Text(fields[1]));
        }

    }

    /** Reducer that writes every received value back out under its key. */
    public static class NormalReducer extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text _key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Text text : values) {
                context.write(_key, text);
            }
        }

    }

}

mapreduce 读写Parquet格式数据 Demo的更多相关文章

java 读写Parquet格式的数据 Parquet example
import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOExce ...
Hive 导入 parquet 格式数据（未完，待续）
Hive 导入 parquet 格式数据 Parquet 格式文件,查看Schema Parquet 之mapreduce Hive 导入 parquet 格式数据
Hive 导入 parquet 格式数据
Hive 导入 parquet 数据步骤如下: 查看 parquet 文件的格式、构造建表语句、导入数据。一.查看 parquet 内容和结构 下载地址 社区工具 GitHub 地址 命令查看结构: ...
matlab 读写其他格式数据文件（excel）
1. excel matlab和excel 中的数据互相导入 xlswrite() mat ⇒ excel 请问怎么把大容量的mat文件导出到excel文件中 – MATLAB中文论坛 % data. ...
Android读写JSON格式的数据之JsonWriter和JsonReader
近期的好几个月都没有搞Android编程了,逐渐的都忘却了一些东西.近期打算找一份Android的工作,要继续拾起曾经的东西.公司月初搬家之后就一直没有网络,直到今日公司才有网络接入,各部门才开始办公 ...
大数据学习day25------spark08-----1. 读取数据库的形式创建DataFrame 2. Parquet格式的数据源 3. Orc格式的数据源 4.spark_sql整合hive 5.在IDEA中编写spark程序（用来操作hive） 6. SQL风格和DSL风格以及RDD的形式计算连续登陆三天的用户
1. 读取数据库的形式创建DataFrame DataFrameFromJDBC object DataFrameFromJDBC { def main(args: Array[String]): U ...
Hadoop 中利用 mapreduce 读写 mysql 数据
Hadoop 中利用 mapreduce 读写 mysql 数据有时候我们在项目中会遇到输入结果集很大,但是输出结果很小,比如一些 pv.uv 数据,然后为了实时查询的需求,或者一些 OLAP ...
spark DataFrame 读写和保存数据
一.读写Parquet(DataFrame) Spark SQL可以支持Parquet.JSON.Hive等数据源,并且可以通过JDBC连接外部数据源.前面的介绍中,我们已经涉及到了JSON.文本格式 ...
Parquet 格式文件
Apache Parquet是Hadoop生态圈中一种新型列式存储格式,它可以兼容Hadoop生态圈中大多数计算框架(Hadoop.Spark等),被多种查询引擎支持(Hive.Impala.Dril ...

随机推荐

禁用Chrome的“请停用以开发者模式运行的扩展程序”提示
1.前言每次启动都会有一个烦人的“请停用以开发者模式运行的扩展程序”提示,这个提示有多烦人,接触过的人都知道,启动的时候它不立即提示,等过了几秒钟等你打开某个网页开始执行某些操作时它突然弹出来干扰你 ...
第十一次作业 LL(1)文法的判断，递归下降分析程序
1. 文法 G(S): (1)S -> AB (2)A ->Da|ε (3)B -> cC (4)C -> aADC |ε (5)D -> b|ε 验证文法 G(S)是不 ...
pandas的使用（4）
pandas的使用(4)--文件读取和保存
2018 ACM-ICPC徐州站网络赛 G题
There's a beach in the first quadrant. And from time to time, there are sea waves. A wave ( xxx , yy ...
openresty 报错：lua entry thread aborted: runtime error
[1]问题现象 (1)本地openresty系统 (2)报错信息 2019/09/10 08:13:55 [error] 2385#2385: *4 lua entry thread aborted: ...
mysql获取日期语句汇总
汇总一些MySQL获取日期的SQL语句. -- 今天 SELECT DATE_FORMAT(NOW(),'%Y-%m-%d 00:00:00') AS '今天开始'; SELECT DATE_FORM ...
Android ADB 实用总结
一.背景从系统架构上来说,Android是基于Linux系统基础上,做了进一步的定制与修改,并融入了自身的特有功能,且向应用层提供应用程序接口,供开发者使用.系统内核层面,主体依然是Linux内核. ...
Blend 阴影倒影模糊效果
原文:Blend 阴影倒影模糊效果 1)阴影和模糊效果很简单在Blend的面板效果中就有体现直接拖拽到控件即可 2)文本加圆角需要一个布局控件Border 设置属性CornerRadius ...
VS报错，Metadata file 'xxx.dll' could not be found
错误提示“Metadata file 'xxx.dll' could not be found”步骤如下:1.右键单击解决方案,然后单击“属性”.2.单击左侧的配置.3.确保选中了它找不到的项目的“生 ...
添加wcf服务引用,无法签出当前文件
写了一些wcf服务接口,使用控制台可以正常启动服务,想要测试一下,新建项目添加服务引用,提示:“无法签出当前文件.该文件可能为只读或已锁定,或者您需要手动签出它.” 在网上找了找,有说可能是因为源代码 ...

mapreduce 读写Parquet格式数据 Demo

mapreduce 读写Parquet格式数据 Demo的更多相关文章

随机推荐

热门专题