Spark Dataset DataFrame空值null,NaN判断和处理

import org.apache.spark.sql.SparkSession

import org.apache.spark.sql.Dataset

import org.apache.spark.sql.Row

import org.apache.spark.sql.DataFrame

import org.apache.spark.sql.Column

import org.apache.spark.sql.DataFrameReader

import org.apache.spark.rdd.RDD

import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

import org.apache.spark.sql.Encoder

import org.apache.spark.sql.functions._

import org.apache.spark.sql.DataFrameStatFunctions

import org.apache.spark.ml.linalg.Vectors

math.sqrt(-1.0)

res43: Double = NaN

math.sqrt(-1.0).isNaN()

res44: Boolean = true

val data1 = data.toDF("affairs", "gender", "age", "yearsmarried", "children", "religiousness", "education", "occupation", "rating")

data1: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

data1.limit(10).show

+-------+------+---+------------+--------+-------------+---------+----------+------+

|affairs|gender|age|yearsmarried|children|religiousness|education|occupation|rating|

+-------+------+---+------------+--------+-------------+---------+----------+------+

|      0|  male| 37|          10|      no|            3|       18|         7|     4|

|      0|  null| 27|        null|      no|            4|       14|         6|  null|

|      0|  null| 32|        null|     yes|            1|       12|         1|  null|

|      0|  null| 57|        null|     yes|            5|       18|         6|  null|

|      0|  null| 22|        null|      no|            2|       17|         6|  null|

|      0|  null| 32|        null|      no|            2|       17|         5|  null|

|      0|female| 22|        null|      no|            2|       12|         1|  null|

|      0|  male| 57|          15|     yes|            2|       14|         4|     4|

|      0|female| 32|          15|     yes|            4|       16|         1|     2|

|      0|  male| 22|         1.5|      no|            4|       14|         4|     5|

+-------+------+---+------------+--------+-------------+---------+----------+------+

 // 删除任意一列含有空值(null)或NaN的行

val resNull=data1.na.drop()

resNull: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

 resNull.limit(10).show()

+-------+------+---+------------+--------+-------------+---------+----------+------+

|affairs|gender|age|yearsmarried|children|religiousness|education|occupation|rating|

+-------+------+---+------------+--------+-------------+---------+----------+------+

|      0|  male| 37|          10|      no|            3|       18|         7|     4|

|      0|  male| 57|          15|     yes|            2|       14|         4|     4|

|      0|female| 32|          15|     yes|            4|       16|         1|     2|

|      0|  male| 22|         1.5|      no|            4|       14|         4|     5|

|      0|  male| 37|          15|     yes|            2|       20|         7|     2|

|      0|  male| 27|           4|     yes|            4|       18|         6|     4|

|      0|  male| 47|          15|     yes|            5|       17|         6|     4|

|      0|female| 22|         1.5|      no|            2|       17|         5|     4|

|      0|female| 27|           4|      no|            4|       14|         5|     4|

|      0|female| 37|          15|     yes|            1|       17|         5|     5|

+-------+------+---+------------+--------+-------------+---------+----------+------+

 // 删除指定列(gender、yearsmarried)中含有空值(null)或NaN的行

val res=data1.na.drop(Array("gender","yearsmarried"))

// 删除在指定列中"非空且非NaN"的值的个数少于 minNonNulls(此处为10)的行;注意这里只指定了2列,阈值10永远无法满足,因此所有行都会被删除

data1.na.drop(10,Array("gender","yearsmarried"))

 // 用指定字符串填充所有字符串类型列中的空值(na.fill(String) 只作用于字符串列)

val res123=data1.na.fill("wangxiao123")

res123: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

 res123.limit(10).show()

+-------+-----------+---+------------+--------+-------------+---------+----------+-----------+

|affairs|     gender|age|yearsmarried|children|religiousness|education|occupation|     rating|

+-------+-----------+---+------------+--------+-------------+---------+----------+-----------+

|      0|       male| 37|          10|      no|            3|       18|         7|          4|

|      0|wangxiao123| 27| wangxiao123|      no|            4|       14|         6|wangxiao123|

|      0|wangxiao123| 32| wangxiao123|     yes|            1|       12|         1|wangxiao123|

|      0|wangxiao123| 57| wangxiao123|     yes|            5|       18|         6|wangxiao123|

|      0|wangxiao123| 22| wangxiao123|      no|            2|       17|         6|wangxiao123|

|      0|wangxiao123| 32| wangxiao123|      no|            2|       17|         5|wangxiao123|

|      0|     female| 22| wangxiao123|      no|            2|       12|         1|wangxiao123|

|      0|       male| 57|          15|     yes|            2|       14|         4|          4|

|      0|     female| 32|          15|     yes|            4|       16|         1|          2|

|      0|       male| 22|         1.5|      no|            4|       14|         4|          5|

+-------+-----------+---+------------+--------+-------------+---------+----------+-----------+

 //对指定的列空值填充

 val res2=data1.na.fill(value="wangxiao111",cols=Array("gender","yearsmarried") )

res2: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

 res2.limit(10).show()

+-------+-----------+---+------------+--------+-------------+---------+----------+------+

|affairs|     gender|age|yearsmarried|children|religiousness|education|occupation|rating|

+-------+-----------+---+------------+--------+-------------+---------+----------+------+

|      0|       male| 37|          10|      no|            3|       18|         7|     4|

|      0|wangxiao111| 27| wangxiao111|      no|            4|       14|         6|  null|

|      0|wangxiao111| 32| wangxiao111|     yes|            1|       12|         1|  null|

|      0|wangxiao111| 57| wangxiao111|     yes|            5|       18|         6|  null|

|      0|wangxiao111| 22| wangxiao111|      no|            2|       17|         6|  null|

|      0|wangxiao111| 32| wangxiao111|      no|            2|       17|         5|  null|

|      0|     female| 22| wangxiao111|      no|            2|       12|         1|  null|

|      0|       male| 57|          15|     yes|            2|       14|         4|     4|

|      0|     female| 32|          15|     yes|            4|       16|         1|     2|

|      0|       male| 22|         1.5|      no|            4|       14|         4|     5|

+-------+-----------+---+------------+--------+-------------+---------+----------+------+

val res3=data1.na.fill(Map("gender"->"wangxiao222","yearsmarried"->"wangxiao567") )

res3: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

 res3.limit(10).show()

+-------+-----------+---+------------+--------+-------------+---------+----------+------+

|affairs|     gender|age|yearsmarried|children|religiousness|education|occupation|rating|

+-------+-----------+---+------------+--------+-------------+---------+----------+------+

|      0|       male| 37|          10|      no|            3|       18|         7|     4|

|      0|wangxiao222| 27| wangxiao567|      no|            4|       14|         6|  null|

|      0|wangxiao222| 32| wangxiao567|     yes|            1|       12|         1|  null|

|      0|wangxiao222| 57| wangxiao567|     yes|            5|       18|         6|  null|

|      0|wangxiao222| 22| wangxiao567|      no|            2|       17|         6|  null|

|      0|wangxiao222| 32| wangxiao567|      no|            2|       17|         5|  null|

|      0|     female| 22| wangxiao567|      no|            2|       12|         1|  null|

|      0|       male| 57|          15|     yes|            2|       14|         4|     4|

|      0|     female| 32|          15|     yes|            4|       16|         1|     2|

|      0|       male| 22|         1.5|      no|            4|       14|         4|     5|

+-------+-----------+---+------------+--------+-------------+---------+----------+------+

 // 查询(过滤)某列为空值/非空值的行

data1.filter("gender is null").select("gender").limit(10).show

+------+

|gender|

+------+

|  null|

|  null|

|  null|

|  null|

|  null|

+------+

 data1.filter("gender is not null").select("gender").limit(10).show

+------+

|gender|

+------+

|  male|

|female|

|  male|

|female|

|  male|

|  male|

|  male|

|  male|

|female|

|female|

+------+

 data1.filter( data1("gender").isNull ).select("gender").limit(10).show

+------+

|gender|

+------+

|  null|

|  null|

|  null|

|  null|

|  null|

+------+

 data1.filter("gender<>''").select("gender").limit(10).show

+------+

|gender|

+------+

|  male|

|female|

|  male|

|female|

|  male|

|  male|

|  male|

|  male|

|female|

|female|

+------+

Spark Dataset DataFrame空值null,NaN判断和处理的更多相关文章

Spark2 Dataset DataFrame空值null,NaN判断和处理
import org.apache.spark.sql.SparkSession import org.apache.spark.sql.Dataset import org.apache.spark ...
Spark Dataset DataFrame 操作
Spark Dataset DataFrame 操作相关博文参考 sparksql中dataframe的用法一.Spark2 Dataset DataFrame空值null,NaN判断和处理 1. ...
oracle中空值null的判断和转换：NVL的用法
1.NULL空值概念数据库里有一个很重要的概念:空值即NULL.有时表中,更确切的说是某些字段值,可能会出现空值, 这是因为这个数据不知道是什么值或根本就不存在. 2.NULL空值判断空值不等同于 ...
dataframe去除null、NaN和空字符串
去除null.NaN 去除 dataframe 中的 null . NaN 有方法 drop ,用 dataframe.na 找出带有 null. NaN 的行,用 drop 删除行: import ...
Spark提高篇——RDD/DataSet/DataFrame（二）
该部分分为两篇,分别介绍RDD与Dataset/DataFrame: 一.RDD 二.DataSet/DataFrame 该篇主要介绍DataSet与DataFrame. 一.生成DataFrame ...
js判断undefined类型,undefined,null,NaN的区别
js判断undefined类型今天使用showModalDialog打开页面,返回值时.当打开的页面点击关闭按钮或直接点浏览器上的关闭则返回值是undefined 所以自作聪明判断 ...
Javascript 中的非空判断 undefined,null, NaN的区别
JS 数据类型在介绍这三个之间的差别之前, 先来看一下JS 的数据类型. 在 Java ,C这样的语言中, 使用一个变量之前,需要先定义这个变量并指定它的数据类型,是整型,字符串型,.... 但是 ...
(转载)Javascript 中的非空判断 undefined,null, NaN的区别
原文地址:https://blog.csdn.net/oscar999/article/details/9353713 在介绍这三个之间的差别之前, 先来看一下JS 的数据类型. 在 Java ,C ...
Spark提高篇——RDD/DataSet/DataFrame（一）
该部分分为两篇,分别介绍RDD与Dataset/DataFrame: 一.RDD 二.DataSet/DataFrame 先来看下官网对RDD.DataSet.DataFrame的解释: 1.RDD ...

随机推荐

分布式零基础之--分布式CAP理论
研究到分布式系统CAP理论,记录下来下回详细分析它: CAP是指三个单词的简称 C: 一致性(Consistence) 所有节点访问的都是同一份最新的数据副本. A: 可用性(Availability ...
CVE-2021-3019 漏洞细节纰漏
CVE编号 CVE-2021-3019 lanproxy任意文件读取该漏洞是2021年比较新的漏洞是否需要认证:否是否执行远程代码:否是否执行远程命令:否数据读取是否内网:否漏洞软件介绍 ...
linux根文件系统 /etc/resolv.conf 文件详解
Linux根文件系统/etc/resolv.conf文件,它是DNS客户机配置文件,用于设置DNS服务器的IP地址及DNS域名,还包含了主机的域名搜索顺序.该文件是由域名解析器(resolver,以恶 ...
LeetCode 124 二叉树中最大路径和
题目: 给定一个非空二叉树,返回其最大路径和. 本题中,路径被定义为一条从树中任意节点出发,达到任意节点的序列.该路径至少包含一个节点,且不一定经过根节点. 思路:递归分为三部分,根节点,左子树,右 ...
第2章 HTML中的JavaScript
目录 1. script标签 1.1 标签位置 1.2 defer推迟执行脚本 1.3 async异步执行脚本 1.4 动态加载脚本 2. noscript标签 1. script标签 <scr ...
PHP SDK短信接口
/** * sdk 短信接口 * @param $tel 手机号 * @param $content 短信内容 * @return bool */ public function telSDK($te ...
Unity优化 1
浅谈Unity中的GC以及优化(转) Unity 官方文档,正巧在博客园发现了已经有位大神(zblade)把原文翻译出来了,而且质量很高~,译文地址在这里.下面我就可耻地把译文搬运了过来,作为上面思 ...
servlet+jsp完成简单登录
将用户在注册界面中的数据填充到数据库相对应的表格中.当用户再次登录时,从数据库中拿到相应的数据查询并与页面的数据做对比,判断是否登陆成功. 需要在HTML文件中将form表单上的action属性值设置 ...
关于spring-data与elasticsearch的使用，自定义repository
之前没有使用过spring-data,关于spring-data有很多很棒的设计,例如仅仅只需要声明一个接口就行,你甚至都不需要去实现,spring-data有内置默认的实现类,基本就上完成绝大多数对 ...
【Linux】nohup和&的区别
同样都是后台执行进程,但是nohup和&有什么区别呢? & 是指后台运行: nohup 的功能和& 之间的功能并不相同. 其中,nohup 可以使得命令永远运行下去和用户终端没 ...

Spark Dataset DataFrame空值null,NaN判断和处理

Spark Dataset DataFrame空值null,NaN判断和处理

Spark Dataset DataFrame空值null,NaN判断和处理的更多相关文章

随机推荐

热门专题