有5个基础的函数: 
- filter 
- select 
- arrange 
- mutate 
- summarise 
- group_by (plus)

可以和databases以及data tables中的数据打交道。

plyr包的特点

其基础函数有以下特点:

  1. 第一个参数df
  2. 返回df
  3. 没有数据更改in place

正是因为有这些特点,才可以使用%>%操作符,方便逻辑式编程。

载入数据

library(plyr)
library(dplyr) # load packages
suppressMessages(library(dplyr))
install.packages("hflights")
library(hflights)
# explore data
data(hflights)
head(hflights)
# convert to local data frame
flights <- tbl_df(hflights)
# printing only shows 10 rows and as many columns as can fit on your screen
flights
# you can specify that you want to see more rows
print(flights, n=20)
# convert to a normal data frame to see all of the columns
data.frame(head(flights))
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18

filter

keep rows matching criteria

# base R approach to view all flights on January 1
flights[flights$Month==1 & flights$DayofMonth==1, ]
# dplyr approach
# note: you can use comma or ampersand to represent AND condition
filter(flights, Month==1, DayofMonth==1)
# use pipe for OR condition
filter(flights, UniqueCarrier=="AA" | UniqueCarrier=="UA")
# you can also use %in% operator
filter(flights, UniqueCarrier %in% c("AA", "UA"))
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9

select

pick columns by name

# base R approach to select DepTime, ArrTime, and FlightNum columns
flights[, c("DepTime", "ArrTime", "FlightNum")]
# dplyr approach
select(flights, DepTime, ArrTime, FlightNum)
# use colon to select multiple contiguous columns, and use `contains` to match columns by name
# note: `starts_with`, `ends_with`, and `matches` (for regular expressions) can also be used to match columns by name
select(flights, Year:DayofMonth, contains("Taxi"), contains("Delay"))
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8

“chaining” or “pipelining”

# nesting method to select UniqueCarrier and DepDelay columns and filter for delays over 60 minutes
filter(select(flights, UniqueCarrier, DepDelay), DepDelay > 60)
# chaining method
flights %>%
select(UniqueCarrier, DepDelay) %>%
filter(DepDelay > 60) # create two vectors and calculate Euclidian distance between them
x1 <- 1:5; x2 <- 2:6
sqrt(sum((x1-x2)^2))
# chaining method
(x1-x2)^2 %>% sum() %>% sqrt()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12

arrange

reorder rows

# base R approach to select UniqueCarrier and DepDelay columns and sort by DepDelay
flights[order(flights$DepDelay), c("UniqueCarrier", "DepDelay")]
# dplyr approach
flights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(DepDelay)
# use `desc` for descending
flights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(desc(DepDelay))
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10

mutate

add new variable 
create new variables that are functions of exciting variables 
which is d 
ifferent form transform

# base R approach to create a new variable Speed (in mph)
flights$Speed <- flights$Distance / flights$AirTime*60
flights[, c("Distance", "AirTime", "Speed")]
# dplyr approach (prints the new variable but does not store it)
flights %>%
select(Distance, AirTime) %>%
mutate(Speed = Distance/AirTime*60)
# store the new variable
flights <- flights %>% mutate(Speed = Distance/AirTime*60)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10

summarise

reduce variables to values

# base R approaches to calculate the average arrival delay to each destination
head(with(flights, tapply(ArrDelay, Dest, mean, na.rm=TRUE)))
head(aggregate(ArrDelay ~ Dest, flights, mean))
# dplyr approach: create a table grouped by Dest, and then summarise each group by taking the mean of ArrDelay
flights %>%
group_by(Dest) %>%
summarise(avg_delay = mean(ArrDelay, na.rm=TRUE))
#summarise_each allows you to apply the same summary function to multiple columns at once
#Note: mutate_each is also available
# for each carrier, calculate the percentage of flights cancelled or diverted
flights %>%
group_by(UniqueCarrier) %>%
summarise_each(funs(mean), Cancelled, Diverted)
# for each carrier, calculate the minimum and maximum arrival and departure delays
flights %>%
group_by(UniqueCarrier) %>%
summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)), matches("Delay"))
#Helper function n() counts the number of rows in a group
#Helper function n_distinct(vector) counts the number of unique items in that vector
# for each day of the year, count the total number of flights and sort in descending order
flights %>%
group_by(Month, DayofMonth) %>%
summarise(flight_count = n()) %>%
arrange(desc(flight_count))
# rewrite more simply with the `tally` function
flights %>%
group_by(Month, DayofMonth) %>%
tally(sort = TRUE)
# for each destination, count the total number of flights and the number of distinct planes that flew there
flights %>%
group_by(Dest) %>%
summarise(flight_count = n(), plane_count = n_distinct(TailNum))
# Grouping can sometimes be useful without summarising
# for each destination, show the number of cancelled and not cancelled flights
flights %>%
group_by(Dest) %>%
select(Cancelled) %>%
table() %>%
head()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40

Window Functions

  • Aggregation function (like mean) takes n inputs and returns 1 value
  • Window function takes n inputs and returns n values 
    Includes ranking and ordering functions (like min_rank), offset functions (lead and lag), and cumulative aggregates (like cummean).
# for each carrier, calculate which two days of the year they had their longest departure delays
# note: smallest (not largest) value is ranked as 1, so you have to use `desc` to rank by largest value
flights %>%
group_by(UniqueCarrier) %>%
select(Month, DayofMonth, DepDelay) %>%
filter(min_rank(desc(DepDelay)) <= 2) %>%
arrange(UniqueCarrier, desc(DepDelay))
# rewrite more simply with the `top_n` function
flights %>%
group_by(UniqueCarrier) %>%
select(Month, DayofMonth, DepDelay) %>%
top_n(2,DepDelay) %>%
arrange(UniqueCarrier, desc(DepDelay)) # for each month, calculate the number of flights and the change from the previous month
flights %>%
group_by(Month) %>%
summarise(flight_count = n()) %>%
mutate(change = flight_count - lag(flight_count)) # rewrite more simply with the `tally` function
flights %>%
group_by(Month) %>%
tally() %>%
mutate(change = n - lag(n))
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25

Other functions

# randomly sample a fixed number of rows, without replacement
flights %>% sample_n(5) # randomly sample a fraction of rows, with replacement
flights %>% sample_frac(0.25, replace=TRUE) # base R approach to view the structure of an object
str(flights) # dplyr approach: better formatting, and adapts to your screen width
glimpse(flights)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12

Connecting Databases

  • dplyr can connect to a database as if the data was loaded into a data frame
  • Use the same syntax for local data frames and databases
  • Only generates SELECT statements
  • Currently supports SQLite, PostgreSQL/Redshift, MySQL/MariaDB, BigQuery, MonetDB
  • Example below is based upon an SQLite database containing the hflights data
  • Instructions for creating this database are in the databases vignette
# connect to an SQLite database containing the hflights data
my_db <- src_sqlite("my_db.sqlite3") # connect to the "hflights" table in that database
flights_tbl <- tbl(my_db, "hflights") # example query with our data frame
flights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(desc(DepDelay)) # identical query using the database
flights_tbl %>%
select(UniqueCarrier, DepDelay) %>%
arrange(desc(DepDelay))
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15

You can write the SQL commands yourself 
dplyr can tell you the SQL it plans to run and the query execution plan

# send SQL commands to the database
tbl(my_db, sql("SELECT * FROM hflights LIMIT 100")) # ask dplyr for the SQL commands
flights_tbl %>%
select(UniqueCarrier, DepDelay) %>%
arrange(desc(DepDelay)) %>%
explain()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8

参考资料

  1. justmarkham的教程1
  2. justmarkdown的教程2

R语言包_dplyr_1的更多相关文章

  1. R语言包在linux上的安装等知识

    有关install.packages()函数的详见:R包 package 的安装(install.packages函数详解) R的包(package)通常有两种:1 binary package:这种 ...

  2. R语言 包

    R语言包 R语言的包是R函数,编译代码和样本数据的集合. 它们存储在R语言环境中名为"library"的目录下. 默认情况下,R语言在安装期间安装一组软件包. 随后添加更多包,当它 ...

  3. R语言——包的添加和使用

    R是开源的软件工具,很多R语言用户和爱好者都会扩展R的功能模块,我们把这些模块称为包.我们可以通过下载安装这些已经写好的包来完成我们需要的任务工作. 包下载地址:https://cran.r-proj ...

  4. R语言包的安装

    pheatmap包的安装 1: 首先R语言的安装路径里面最好不要有中文路径 2: 在安装其他依存的scales和colorspace包时候要关闭防火墙 错误提示: 试开URL'https://mirr ...

  5. Windows下使用Rtools编译R语言包

    使用devtools安装github中的R源代码时,经常会出各种错误,索性搜了一下怎么在Windows下直接打包,网上的资料也是参差不齐,以下是自己验证通过的. 一.下载Rtools 下载地址:htt ...

  6. r语言 包说明

    [在实际工作中,每个数据科学项目各不相同,但基本都遵循一定的通用流程.具体如下]   [下面列出每个步骤最有用的一些R包] 1.数据导入以下R包主要用于数据导入和保存数据:feather:一种快速,轻 ...

  7. R语言包相关命令

    R的包(package)通常有两种:1 binary package:这种包属于即得即用型(ready-to-use),但是依赖与平台,即Win和Linux平台下不同.2 Source package ...

  8. R语言包翻译

    Shiny-cheatsheet 作者:周彦通 1.安装 install.packages("shinydashboard")  2.基础知识 仪表盘有三个部分:标题.侧边栏,身体 ...

  9. R语言包翻译——翻译

    Shiny-cheatsheet                                                                                     ...

随机推荐

  1. scp拷贝文件

    有了亚马逊的ec2后,物美价廉,但是,亚马逊的aws使用密钥登陆的,命令和密码登录有一点不同.记录. 1.有密钥登陆,首先要把密钥文件 xxx.pem 的权限设为700,否则会报错. scp -i x ...

  2. Atitti cto的日常流程与职责attilax总结

    Atitti cto的日常流程与职责attilax总结 1. (最重要)技术战略制定 规划,预测趋势1 1.1. 关键词 Throught技术雷达 趋势 没落  已死  辉煌 未来1 1.2. (比较 ...

  3. FFmpeg(9)-解码器解码代码演示(FFmpeg调用MediaCodec实现硬解码、多线程解码、及音视频解码性能测试)

    一.AVFrame 用来存放解码后的数据. [相关函数] AVFrame *frame = av_frame_alloc();                       // 空间分配,分配一个空间 ...

  4. 解决myeclipse/eclipse创建或导入maven工程时引发的问题

    起因: 最近学习maven,按照教程把命令行创建的maven工程导入到eclipse/myeclipse,由于库中没有一些依赖包,所以在导入工程的时候开发工具自动下载依赖包.可是,由于天朝特殊环境的问 ...

  5. Eclipse 配置Maven以及修改默认Repository

    今天将Eclipse关于Maven的配置总结一下,方便以后配置. 一.配置Maven环境 1.下载apache-maven文件,选择自己需要的版本,地址:http://mirrors.cnnic.cn ...

  6. 每日英语:5 Things to Know About Missing Malaysia Airlines Flight and Air Safety

    Malaysia Airlines Flight MH370, with 239 people aboard, lost contact early Saturday with the airline ...

  7. 每日英语:Nanjing's New Sifang Art Museum Illustrates China's Cultural Boom

    In a forest on the outskirts of this former Chinese capital, 58-year-old real-estate developer Lu Ju ...

  8. 对于iOS开发人工智能意味着什么

    对于iOS开发人工智能意味着什么? 前言 近几年来人工智能的话题那是炙手可热.在国内很多大佬言必谈机器学习和大数据:在美国刚毕业的人工智能 PHD 也是众人追捧,工资直逼 NFL 四分卫.人工智能甚至 ...

  9. openfire聊天记录插件

    package com.sqj.openfire.chat.logs; import java.io.File; import java.util.Date; import java.util.Lis ...

  10. Makefile常用万能模板(包括静态链接库、动态链接库、可执行文件)

    本文把makefile 分成了三份:生成可执行文件的makefile,生成静态链接库的makefile,生成动态链接库的makefile. 这些makefile都很简单,一般都是一看就会用,用法也很容 ...