This article is reposted from:

  http://www.aboutyun.com/thread-7358-1-1.html

Hadoop writes all of its text output as UTF-8 without a BOM. On Windows, however, the default encoding for Chinese text is GBK: when a file such as a CSV written in BOM-less UTF-8 is opened in Excel, the Chinese characters come out garbled, and the file displays correctly only in editors such as UltraEdit or Notepad. Changing Hadoop's default output encoding to GBK is therefore a very common requirement.
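To see why the same characters display differently, here is a small illustrative snippet (not from the original post) that prints the byte sequences one Chinese string produces under each encoding; a viewer such as Excel that assumes GBK will misread the UTF-8 bytes:

    import java.util.Arrays;

    public class EncodingDemo {
      public static void main(String[] args) throws Exception {
        String s = "中文";
        // UTF-8 encodes each of these characters in 3 bytes, GBK in 2,
        // and the byte values differ, so a reader that assumes the
        // wrong charset decodes garbage.
        System.out.println(Arrays.toString(s.getBytes("UTF-8")));  // 6 bytes
        System.out.println(Arrays.toString(s.getBytes("GBK")));    // 4 bytes
      }
    }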
By default, the statement in the MR driver that determines the output encoding is:

    job.setOutputFormatClass(TextOutputFormat.class);

The source of TextOutputFormat.class is as follows:

    /**
     * Licensed to the Apache Software Foundation (ASF) under one
     * or more contributor license agreements.  See the NOTICE file
     * distributed with this work for additional information
     * regarding copyright ownership.  The ASF licenses this file
     * to you under the Apache License, Version 2.0 (the
     * "License"); you may not use this file except in compliance
     * with the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    package org.apache.hadoop.mapreduce.lib.output;

    import java.io.DataOutputStream;
    import java.io.IOException;
    import java.io.UnsupportedEncodingException;

    import org.apache.hadoop.classification.InterfaceAudience;
    import org.apache.hadoop.classification.InterfaceStability;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.fs.FSDataOutputStream;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.compress.CompressionCodec;
    import org.apache.hadoop.io.compress.GzipCodec;
    import org.apache.hadoop.mapreduce.OutputFormat;
    import org.apache.hadoop.mapreduce.RecordWriter;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.util.*;

    /** An {@link OutputFormat} that writes plain text files. */
    @InterfaceAudience.Public
    @InterfaceStability.Stable
    public class TextOutputFormat<K, V> extends FileOutputFormat<K, V> {
      public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";

      protected static class LineRecordWriter<K, V>
          extends RecordWriter<K, V> {
        private static final String utf8 = "UTF-8";  // change "UTF-8" to "GBK"
        private static final byte[] newline;
        static {
          try {
            newline = "\n".getBytes(utf8);
          } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
          }
        }

        protected DataOutputStream out;
        private final byte[] keyValueSeparator;

        public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
          this.out = out;
          try {
            this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
          } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
          }
        }

        public LineRecordWriter(DataOutputStream out) {
          this(out, "\t");
        }

        /**
         * Write the object to the byte stream, handling Text as a special
         * case.
         * @param o the object to print
         * @throws IOException if the write throws, we pass it on
         */
        private void writeObject(Object o) throws IOException {
          // Comment out the whole "instanceof Text" special case below
          // (the if, the two lines inside it, and the else line), so that
          // Text values are also re-encoded through getBytes(utf8)
          // instead of being written as their raw internal UTF-8 bytes.
          if (o instanceof Text) {
            Text to = (Text) o;
            out.write(to.getBytes(), 0, to.getLength());
          } else {
            out.write(o.toString().getBytes(utf8));
          }
        }

        public synchronized void write(K key, V value)
            throws IOException {
          boolean nullKey = key == null || key instanceof NullWritable;
          boolean nullValue = value == null || value instanceof NullWritable;
          if (nullKey && nullValue) {
            return;
          }
          if (!nullKey) {
            writeObject(key);
          }
          if (!(nullKey || nullValue)) {
            out.write(keyValueSeparator);
          }
          if (!nullValue) {
            writeObject(value);
          }
          out.write(newline);
        }

        public synchronized
        void close(TaskAttemptContext context) throws IOException {
          out.close();
        }
      }

      public RecordWriter<K, V>
          getRecordWriter(TaskAttemptContext job
                          ) throws IOException, InterruptedException {
        Configuration conf = job.getConfiguration();
        boolean isCompressed = getCompressOutput(job);
        String keyValueSeparator = conf.get(SEPERATOR, "\t");
        CompressionCodec codec = null;
        String extension = "";
        if (isCompressed) {
          Class<? extends CompressionCodec> codecClass =
            getOutputCompressorClass(job, GzipCodec.class);
          codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
          extension = codec.getDefaultExtension();
        }
        Path file = getDefaultWorkFile(job, extension);
        FileSystem fs = file.getFileSystem(conf);
        if (!isCompressed) {
          FSDataOutputStream fileOut = fs.create(file, false);
          return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
        } else {
          FSDataOutputStream fileOut = fs.create(file, false);
          return new LineRecordWriter<K, V>(new DataOutputStream
                                            (codec.createOutputStream(fileOut)),
                                            keyValueSeparator);
        }
      }
    }

As the declaration private static final String utf8 = "UTF-8" in the listing above shows, Hadoop hard-codes UTF-8 for this output format. To change the encoding of Hadoop's text output, it is enough to define a class GbkOutputFormat with the same body as TextOutputFormat, likewise extending FileOutputFormat (note: org.apache.hadoop.mapreduce.lib.output.FileOutputFormat), as in the following code:

    import java.io.DataOutputStream;
    import java.io.IOException;
    import java.io.UnsupportedEncodingException;

    import org.apache.hadoop.classification.InterfaceAudience;
    import org.apache.hadoop.classification.InterfaceStability;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.fs.FSDataOutputStream;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.compress.CompressionCodec;
    import org.apache.hadoop.io.compress.GzipCodec;
    import org.apache.hadoop.mapreduce.OutputFormat;
    import org.apache.hadoop.mapreduce.RecordWriter;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.*;

    @InterfaceAudience.Public
    @InterfaceStability.Stable
    public class GbkOutputFormat<K, V> extends FileOutputFormat<K, V> {
      public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";

      protected static class LineRecordWriter<K, V>
          extends RecordWriter<K, V> {
        // Field name kept from TextOutputFormat for a minimal diff;
        // it now holds the GBK charset name.
        private static final String utf8 = "GBK";
        private static final byte[] newline;
        static {
          try {
            newline = "\n".getBytes(utf8);
          } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
          }
        }

        protected DataOutputStream out;
        private final byte[] keyValueSeparator;

        public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
          this.out = out;
          try {
            this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
          } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
          }
        }

        public LineRecordWriter(DataOutputStream out) {
          this(out, "\t");
        }

        /**
         * Write the object to the byte stream. Unlike TextOutputFormat,
         * Text is no longer a special case: every object is re-encoded
         * to GBK through toString().
         * @param o the object to print
         * @throws IOException if the write throws, we pass it on
         */
        private void writeObject(Object o) throws IOException {
          // The whole Text special case is commented out (including the
          // if and the else), so Text values do not bypass the re-encoding:
          // if (o instanceof Text) {
          //   Text to = (Text) o;
          //   out.write(to.getBytes(), 0, to.getLength());
          // } else {
          out.write(o.toString().getBytes(utf8));
          // }
        }

        public synchronized void write(K key, V value)
            throws IOException {
          boolean nullKey = key == null || key instanceof NullWritable;
          boolean nullValue = value == null || value instanceof NullWritable;
          if (nullKey && nullValue) {
            return;
          }
          if (!nullKey) {
            writeObject(key);
          }
          if (!(nullKey || nullValue)) {
            out.write(keyValueSeparator);
          }
          if (!nullValue) {
            writeObject(value);
          }
          out.write(newline);
        }

        public synchronized
        void close(TaskAttemptContext context) throws IOException {
          out.close();
        }
      }

      public RecordWriter<K, V>
          getRecordWriter(TaskAttemptContext job
                          ) throws IOException, InterruptedException {
        Configuration conf = job.getConfiguration();
        boolean isCompressed = getCompressOutput(job);
        String keyValueSeparator = conf.get(SEPERATOR, "\t");
        CompressionCodec codec = null;
        String extension = "";
        if (isCompressed) {
          Class<? extends CompressionCodec> codecClass =
            getOutputCompressorClass(job, GzipCodec.class);
          codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
          extension = codec.getDefaultExtension();
        }
        Path file = getDefaultWorkFile(job, extension);
        FileSystem fs = file.getFileSystem(conf);
        if (!isCompressed) {
          FSDataOutputStream fileOut = fs.create(file, false);
          return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
        } else {
          FSDataOutputStream fileOut = fs.create(file, false);
          return new LineRecordWriter<K, V>(new DataOutputStream
                                            (codec.createOutputStream(fileOut)),
                                            keyValueSeparator);
        }
      }
    }
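The commented-out branch is the crucial change: a Text object stores its contents as raw UTF-8 bytes, so writing to.getBytes() directly would emit UTF-8 no matter which charset name the class holds. A small illustrative check (not part of the original post) makes this visible:

    import org.apache.hadoop.io.Text;

    public class TextBytesCheck {
      public static void main(String[] args) throws Exception {
        Text t = new Text("中文");
        // Text.getLength() counts the internal UTF-8 bytes: 3 per character here.
        System.out.println(t.getLength());                        // prints 6
        // Re-encoding through toString(), as GbkOutputFormat does,
        // produces the 2-byte-per-character GBK form instead.
        System.out.println(t.toString().getBytes("GBK").length);  // prints 4
      }
    }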

Finally, set the job's output format class to GbkOutputFormat.class, for example:

    job.setOutputFormatClass(GbkOutputFormat.class);
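For completeness, here is a minimal driver sketch showing the class in use; MyMapper, MyReducer, and the path arguments are hypothetical placeholders, and GbkOutputFormat is assumed to be on the job's classpath:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class GbkJobDriver {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "gbk output demo");
        job.setJarByClass(GbkJobDriver.class);
        job.setMapperClass(MyMapper.class);    // hypothetical mapper class
        job.setReducerClass(MyReducer.class);  // hypothetical reducer class
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // The only change from an ordinary text-output job:
        job.setOutputFormatClass(GbkOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
      }
    }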

Reference:

  http://semantic.iteye.com/blog/1846238
