pyspark SparkSession及dataframe基本操作

from pyspark import SparkContext, SparkConf

import os

from pyspark.sql.session import SparkSession

from pyspark.sql import Row

def CreateSparkContex():
    """Build the SparkContext and SparkSession used by this script.

    The configuration names the application "MYPRO" and turns off the
    console progress bar. After the context is created, its master URL is
    printed, the log level is set to WARN, and ``Setpath`` is called so the
    module-level ``Path`` is populated before the caller reads it.

    Returns:
        tuple: ``(sc, spark)`` -- the SparkContext and the SparkSession
        obtained from the same configuration.
    """
    conf = SparkConf()
    conf = conf.setAppName("MYPRO")
    conf = conf.set("spark.ui.showConsoleProgress", "false")

    context = SparkContext(conf=conf)
    print("master:" + context.master)
    context.setLogLevel("WARN")

    # Must run before the caller uses the global ``Path``.
    Setpath(context)

    session = SparkSession.builder.config(conf=conf).getOrCreate()
    return context, session

def Setpath(sc):
    """Set the module-level ``Path`` according to the context's master URL.

    Args:
        sc: Object exposing a ``master`` string (the SparkContext in this
            script).

    Side effects:
        Assigns the global ``Path``: the local workspace URI when the master
        begins with "local", otherwise the HDFS URI.
    """
    global Path

    running_locally = sc.master.startswith("local")
    Path = "file:/C:/spark/sparkworkspace" if running_locally else "hdfs://test"

if __name__ == "__main__":
    # Demo script: walks through basic SparkSession / DataFrame operations
    # (reading CSV, SQL queries, column selection, derived columns, filtering,
    # sorting, de-duplication, grouping and joins) on the iris data set.

    print("Here we go!\n")

    sc, spark = CreateSparkContex()

    # try/finally guarantees the Spark session is released even when one of
    # the demo steps below raises (e.g. the CSV file is missing).
    try:
        # ``Path`` is a URI (file:/... or hdfs://...), so it is joined with an
        # explicit "/"; os.path.join would insert a backslash on Windows.
        readcsvpath = Path + "/iris.csv"

        # Read the CSV with an explicit schema (column types are declared
        # instead of inferred).
        dfcsv = spark.read.csv(
            readcsvpath,
            header=True,
            schema=("`Sepal.Length` DOUBLE,`Sepal.Width` DOUBLE,`Petal.Length` DOUBLE,`Petal.Width` DOUBLE,`Species` string"),
        )
        dfcsv.show(3)

        # Register the DataFrame as a temporary view so it can be queried
        # with SQL. createOrReplaceTempView replaces registerTempTable, which
        # has been deprecated since Spark 2.0.
        dfcsv.createOrReplaceTempView('Iris')

        # Query with SQL statements.
        spark.sql("select * from Iris limit 3").show()
        spark.sql("select Species,count(1) from Iris group by Species").show()

        # Create an alias of the DataFrame.
        df = dfcsv.alias('Iris1')

        # ---------- Selecting columns ----------
        # Column names containing special characters (the dot) must be
        # escaped with backticks.
        df.select('Species', '`Sepal.Width`').show(4)
        df.select(df.Species, df['`Sepal.Width`']).show(4)
        # Mixing the original DataFrame with a column taken from the alias.
        dfcsv.select(df.Species).show(4)
        df[df.Species, df['`Sepal.Width`']].show(4)
        # The next two expressions are illustrative only; their results are
        # discarded.
        df[['Species']]  # list indexing selects columns, same as pandas
        df['Species']  # note: this is a Column, not a DataFrame

        # ---------- Adding a derived column ----------
        df[df['`Sepal.Length`'], df['`Sepal.Width`'],
           df['`Sepal.Length`'] - df['`Sepal.Width`']].show(4)
        # Same as above, but the derived column is renamed.
        df[df['`Sepal.Length`'], df['`Sepal.Width`'],
           (df['`Sepal.Length`'] - df['`Sepal.Width`']).alias('rua')].show(4)

        # ---------- Filtering rows ----------
        # Boolean indexing, same as pandas.
        df[df.Species == 'virginica'].show(4)
        # Multiple conditions.
        df[(df.Species == 'virginica') & (df['`Sepal.Width`'] > 1)].show(4)
        # The filter method can be used as well.
        df.filter(df.Species == 'virginica').show(4)
        # Filtering through SQL.
        spark.sql("select * from Iris where Species='virginica'").show(4)

        # ---------- Sorting on one or more columns ----------
        spark.sql("select * from Iris order by `Sepal.Length` asc ").show(4)  # ascending
        spark.sql("select * from Iris order by `Sepal.Length` desc ").show(4)  # descending
        # First column ascending, second column descending.
        spark.sql("select * from Iris order by `Sepal.Length` asc,`Sepal.Width` desc ").show(4)

        # Descending.
        df.select('`Sepal.Length`', '`Sepal.Width`').orderBy('`Sepal.Width`', ascending=0).show(4)
        # Ascending.
        df.select('`Sepal.Length`', '`Sepal.Width`').orderBy('`Sepal.Width`').show(4)
        # Ascending, which is the default.
        df.select('`Sepal.Length`', '`Sepal.Width`').orderBy('`Sepal.Width`', ascending=1).show(4)
        # Descending via the Column API.
        df.select('`Sepal.Length`', '`Sepal.Width`').orderBy(df['`Sepal.Width`'].desc()).show(4)
        # Two columns: first descending, then ascending.
        df.select('`Sepal.Length`', '`Sepal.Width`').orderBy(
            ['`Sepal.Length`', '`Sepal.Width`'], ascending=[0, 1]).show(4)
        df.orderBy(df['`Sepal.Length`'].desc(), df['`Sepal.Width`']).show(4)

        # ---------- De-duplication ----------
        spark.sql("select distinct Species from Iris").show()
        spark.sql("select distinct Species,`Sepal.Width` from Iris").show()
        df.select('Species').distinct().show()
        df.select('Species', '`Sepal.Width`').distinct().show()
        # Same result as distinct(); the name matches the pandas method.
        df.select('Species').drop_duplicates().show()
        # Same result again, camelCase spelling.
        df.select('Species').dropDuplicates().show()

        # ---------- Grouped aggregation ----------
        spark.sql("select Species,count(1) from Iris group by Species").show()
        df[['Species']].groupby('Species').count().show()
        df.groupby(['Species']).agg({'`Sepal.Width`': 'sum'}).show()
        df.groupby(['Species']).agg({'`Sepal.Width`': 'sum', '`Sepal.Length`': 'mean'}).show()

        # ---------- Joining data ----------
        # Build a small lookup table (species -> label) from an RDD of Rows.
        dic = [['virginica', 'A1'], ['versicolor', 'A2'], ['setosa', 'A3']]
        rrd = sc.parallelize(dic)
        df2 = rrd.map(lambda p: Row(lei=p[0], al=p[1]))
        df2frame = spark.createDataFrame(df2)
        df2frame.show()
        df2frame.createOrReplaceTempView('dictable')

        # Left join through SQL, then the same join through the DataFrame API.
        spark.sql("select * from Iris u left join dictable z on u.Species=z.lei").show()
        df.join(df2frame, df.Species == df2frame.lei, 'left_outer').show()
    finally:
        # Stopping the session also stops the underlying SparkContext, so a
        # separate sc.stop() is not needed.
        spark.stop()

pyspark SparkSession及dataframe基本操作的更多相关文章

DataFrame基本操作
这些操作在网上都可以百度得到,为了便于记忆自己再根据理解总结在一起.---------励志做一个优雅的网上搬运工 1.建立dataframe (1)Dict to Dataframe df = pd. ...
将数据从数据库直接通过 pyspark 读入到dataframe
from pyspark.sql import SparkSession spark = SparkSession \ .builder \ .appName("Python Spark S ...
[Spark SQL] SparkSession、DataFrame 和 DataSet 练习
本課主題 DataSet 实战 DataSet 实战 SparkSession 是 SparkSQL 的入口,然后可以基于 sparkSession 来获取或者是读取源数据来生存 DataFrameR ...
python做数据分析pandas库介绍之DataFrame基本操作
怎样删除list中空字符? 最简单的方法:new_list = [ x for x in li if x != '' ] 这一部分主要学习pandas中基于前面两种数据结构的基本操作. 设有DataF ...
用python做数据分析pandas库介绍之DataFrame基本操作
怎样删除list中空字符? 最简单的方法:new_list = [ x for x in li if x != '' ] 这一部分主要学习pandas中基于前面两种数据结构的基本操作. 设有DataF ...
pandas库介绍之DataFrame基本操作
怎样删除list中空字符? 最简单的方法:new_list = [ x for x in li if x != '' ] 今天是5.1号. 这一部分主要学习pandas中基于前面两种数据结构的基本操作 ...
用python做数据分析4|pandas库介绍之DataFrame基本操作
原文地址怎样删除list中空字符? 最简单的方法:new_list = [ x for x in li if x != '' ] 今天是5.1号. 这一部分主要学习pandas中基于前面两种数据结构 ...
机器学习三剑客之Pandas中DataFrame基本操作
Pandas 是基于Numpy 的一种工具,是为了解决数据分析任务而创建的.Pandas 纳入了大量库和一些标准的数据模型,提供了高效地操作大型数据集所需的工具.Pandas提供了大量能使我们快速便捷 ...
sparksession创建DataFrame方式
spark创建dataFrame方式有很多种,官方API也比较多公司业务上的个别场景使用了下面两种方式 1.通过List创建dataFrame /** * Applies a schema to a ...

随机推荐

【转】ANDROID自定义视图——onMeasure，MeasureSpec源码流程思路详解
原文地址:http://blog.csdn.net/a396901990/article/details/36475213 简介: 在自定义view的时候,其实很简单,只需要知道3步骤: 1.测量—— ...
.net core webapi 文件上传在 Swagger 文档中的友好提示处理
前提: 需要nuget Swashbuckle.AspNetCore 我暂时用的是 4.01 最新版本: 描述:解决 .net core webapi 上传文件使用的是 IFormFile,在S ...
C#NPOI.RabbitMQ.EF.Attribute.HttpRuntime.Cache.AD域.List<T>根据指定字段去重.前端JQuery.Cache.I18N(多语言).data-xx(自定义属性)
使用NPOI 操作Excel 个人使用的电脑基本默认安装Excel 操作起来调用Excel的组件便可.如果是一台服务器.没有安装Excel,也就无法调用Excel组件. 在此推荐第三方插件.NPOI ...
2018年第九届蓝桥杯C/C++A组省赛（最后一题）
第十题付账问题 [题目描述] 几个人一起出去吃饭是常有的事.但在结帐的时候,常常会出现一些争执. 现在有 n 个人出去吃饭,他们总共消费了 S 元.其中第 i 个人带了 ai 元.幸 ...
C语言宏的定义和宏的使用方法（#define）
1.宏的功能介绍在 C 语言中,可以采用命令 #define 来定义宏.该命令允许把一个名称指定成任何所需的文本,例如一个常量值或者一条语句.在定义了宏之后,无论宏名称出现在源代码的何处,预处理器都 ...
用C语言构建一个可执行程序的流程
1.流程图从用C语言写源代码,然后经过编译器.连接器到最终可执行程序的流程图大致如下图所示. 2.编译流程首先,我们先用C语言把源代码写好,然后交给C语言编译器.C语言编译器内部分为前端和后端. ...
使用C语言封装数组，动态实现增删改查
myArray.h : #pragma once //包含的时候只包含一次 #include <stdio.h> #include <stdlib.h> #include &l ...
MyBatis配置文件mybatis-config.xml
<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE configuration PUBLIC & ...
16、OpenCV Python 腐蚀和膨胀
__author__ = "WSX" import cv2 as cv import numpy as np def erode_demo(image): print(image. ...
php Tp5下mysql的增删改查
// 增 public function insert(){ $data = array( "username"=>"user121", "pa ...

pyspark SparkSession及dataframe基本操作

pyspark SparkSession及dataframe基本操作的更多相关文章

随机推荐

热门专题