R语言学习笔记:SQL操作
R本身很强大,但如果你对SQL非常熟悉,这项技能也不应浪费:可以借助sqldf包直接用SQL语句查询数据框。下面几条用法摘自example("sqldf"),留作备查,以后可能会用上。
# Side-by-side comparison of base-R data-frame operations and their SQL
# equivalents run through sqldf. For every pair, the object ending in "r" is
# built with base R and the object ending in "s" is built by an SQL query
# against the same data frame.

# NOTE(review): tcltk is attached before sqldf, presumably because one of
# sqldf's dependencies can make use of it -- confirm it is still required.
library("tcltk")
# sqldf() is called throughout this script, so the package has to be attached;
# without this line every sqldf() call fails with "could not find function".
library("sqldf")

# head: the first six rows of warpbreaks.
a1r <- head(warpbreaks)
a1s <- sqldf("select * from warpbreaks limit 6")

# Prefix match: rows of CO2 whose Plant starts with "Qn".
a2r <- subset(CO2, grepl("^Qn", Plant))
a2s <- sqldf("select * from CO2 where Plant like 'Qn%'")

# Set membership: farms rows whose Manag is "BF" or "HF".
data(farms, package = "MASS")
a3r <- subset(farms, Manag %in% c("BF", "HF"))
# subset() keeps the original row names; reset them to 1..n.
row.names(a3r) <- NULL
a3s <- sqldf("select * from farms where Manag in ('BF', 'HF')")

# Range filter: breaks between 20 and 30 inclusive.
# row.names = TRUE asks sqldf to carry the input's row names through the
# query, so a4r is left with its original row names here.
a4r <- subset(warpbreaks, breaks >= 20 & breaks <= 30)
a4s <- sqldf(
  "select * from warpbreaks where breaks between 20 and 30",
  row.names = TRUE
)

# Equality filters on Mois, again keeping row names on the SQL side.
a5r <- subset(farms, Mois == "M1")
a5s <- sqldf("select * from farms where Mois = 'M1'", row.names = TRUE)

a6r <- subset(farms, Mois == "M2")
a6s <- sqldf("select * from farms where Mois = 'M2'", row.names = TRUE)

# rbind vs. UNION ALL: stack the two filtered results.
a7r <- rbind(a5r, a6r)
row.names(a7r) <- NULL
a7s <- sqldf("select * from a5s union all select * from a6s")
其余例子暂时用不到,就不逐一尝试了,仅把example("sqldf")的输出记录在下面。
sqldf> # aggregate - avg conc and uptake by Plant and Type
sqldf> a8r <- aggregate(iris[1:2], iris[5], mean)
sqldf> a8s <- sqldf('select Species, avg("Sepal.Length") `Sepal.Length`,
sqldf+ avg("Sepal.Width") `Sepal.Width` from iris group by Species')
sqldf> all.equal(a8r, a8s)
[1] TRUE
sqldf> # by - avg conc and total uptake by Plant and Type
sqldf> a9r <- do.call(rbind, by(iris, iris[5], function(x) with(x,
sqldf+ data.frame(Species = Species[1],
sqldf+ mean.Sepal.Length = mean(Sepal.Length),
sqldf+ mean.Sepal.Width = mean(Sepal.Width),
sqldf+ mean.Sepal.ratio = mean(Sepal.Length/Sepal.Width)))))
sqldf> row.names(a9r) <- NULL
sqldf> a9s <- sqldf('select Species, avg("Sepal.Length") `mean.Sepal.Length`,
sqldf+ avg("Sepal.Width") `mean.Sepal.Width`,
sqldf+ avg("Sepal.Length"/"Sepal.Width") `mean.Sepal.ratio` from iris
sqldf+ group by Species')
sqldf> all.equal(a9r, a9s)
[1] TRUE
sqldf> # head - top 3 breaks
sqldf> a10r <- head(warpbreaks[order(warpbreaks$breaks, decreasing = TRUE), ], 3)
sqldf> a10s <- sqldf("select * from warpbreaks order by breaks desc limit 3")
sqldf> row.names(a10r) <- NULL
sqldf> identical(a10r, a10s)
[1] TRUE
sqldf> # head - bottom 3 breaks
sqldf> a11r <- head(warpbreaks[order(warpbreaks$breaks), ], 3)
sqldf> a11s <- sqldf("select * from warpbreaks order by breaks limit 3")
sqldf> # attributes(a11r) <- attributes(a11s) <- NULL
sqldf> row.names(a11r) <- NULL
sqldf> identical(a11r, a11s)
[1] TRUE
sqldf> # ave - rows for which v exceeds its group average where g is group
sqldf> DF <- data.frame(g = rep(1:2, each = 5), t = rep(1:5, 2), v = 1:10)
sqldf> a12r <- subset(DF, v > ave(v, g, FUN = mean))
sqldf> Gavg <- sqldf("select g, avg(v) as avg_v from DF group by g")
sqldf> a12s <- sqldf("select DF.g, t, v from DF, Gavg where DF.g = Gavg.g and v > avg_v")
sqldf> row.names(a12r) <- NULL
sqldf> identical(a12r, a12s)
[1] TRUE
sqldf> # same but reduce the two select statements to one using a subquery
sqldf> a13s <- sqldf("select g, t, v
sqldf+ from DF d1, (select g as g2, avg(v) as avg_v from DF group by g)
sqldf+ where d1.g = g2 and v > avg_v")
sqldf> identical(a12r, a13s)
[1] TRUE
sqldf> # same but shorten using natural join
sqldf> a14s <- sqldf("select g, t, v
sqldf+ from DF
sqldf+ natural join (select g, avg(v) as avg_v from DF group by g)
sqldf+ where v > avg_v")
sqldf> identical(a12r, a14s)
[1] TRUE
sqldf> # table
sqldf> a15r <- table(warpbreaks$tension, warpbreaks$wool)
sqldf> a15s <- sqldf("select sum(wool = 'A'), sum(wool = 'B')
sqldf+ from warpbreaks group by tension")
sqldf> all.equal(as.data.frame.matrix(a15r), a15s, check.attributes = FALSE)
[1] TRUE
sqldf> # reshape
sqldf> t.names <- paste("t", unique(as.character(DF$t)), sep = "_")
sqldf> a16r <- reshape(DF, direction = "wide", timevar = "t", idvar = "g", varying = list(t.names))
sqldf> a16s <- sqldf("select
sqldf+ g,
sqldf+ sum((t == 1) * v) t_1,
sqldf+ sum((t == 2) * v) t_2,
sqldf+ sum((t == 3) * v) t_3,
sqldf+ sum((t == 4) * v) t_4,
sqldf+ sum((t == 5) * v) t_5
sqldf+ from DF group by g")
sqldf> all.equal(a16r, a16s, check.attributes = FALSE)
[1] TRUE
sqldf> # order
sqldf> a17r <- Formaldehyde[order(Formaldehyde$optden, decreasing = TRUE), ]
sqldf> a17s <- sqldf("select * from Formaldehyde order by optden desc")
sqldf> row.names(a17r) <- NULL
sqldf> identical(a17r, a17s)
[1] TRUE
sqldf> # centered moving average of length 7
sqldf> set.seed(1)
sqldf> DF <- data.frame(x = rnorm(15, 1:15))
sqldf> s18 <- sqldf("select a.x x, avg(b.x) movavgx from DF a, DF b
sqldf+ where a.row_names - b.row_names between -3 and 3
sqldf+ group by a.row_names having count(*) = 7
sqldf+ order by a.row_names+0",
sqldf+ row.names = TRUE)
sqldf> r18 <- data.frame(x = DF[4:12,], movavgx = rowMeans(embed(DF$x, 7)))
sqldf> row.names(r18) <- NULL
sqldf> all.equal(r18, s18)
[1] TRUE
sqldf> # merge. a19r and a19s are same except row order and row names
sqldf> A <- data.frame(a1 = c(1, 2, 1), a2 = c(2, 3, 3), a3 = c(3, 1, 2))
sqldf> B <- data.frame(b1 = 1:2, b2 = 2:1)
sqldf> a19s <- sqldf("select * from A, B")
sqldf> a19r <- merge(A, B)
sqldf> Sort <- function(DF) DF[do.call(order, DF),]
sqldf> all.equal(Sort(a19s), Sort(a19r), check.attributes = FALSE)
[1] TRUE
sqldf> # within Date, of the highest quality records list the one closest
sqldf> # to noon. Note use of two sql statements in one call to sqldf.
sqldf>
sqldf> Lines <- "DeployID Date.Time LocationQuality Latitude Longitude
sqldf+ STM05-1 2005/02/28 17:35 Good -35.562 177.158
sqldf+ STM05-1 2005/02/28 19:44 Good -35.487 177.129
sqldf+ STM05-1 2005/02/28 23:01 Unknown -35.399 177.064
sqldf+ STM05-1 2005/03/01 07:28 Unknown -34.978 177.268
sqldf+ STM05-1 2005/03/01 18:06 Poor -34.799 177.027
sqldf+ STM05-1 2005/03/01 18:47 Poor -34.85 177.059
sqldf+ STM05-2 2005/02/28 12:49 Good -35.928 177.328
sqldf+ STM05-2 2005/02/28 21:23 Poor -35.926 177.314
sqldf+ "
sqldf> DF <- read.table(textConnection(Lines), skip = 1, as.is = TRUE,
sqldf+ col.names = c("Id", "Date", "Time", "Quality", "Lat", "Long"))
sqldf> sqldf(c("create temp table DFo as select * from DF order by
sqldf+ Date DESC, Quality DESC,
sqldf+ abs(substr(Time, 1, 2) + substr(Time, 4, 2) /60 - 12) DESC",
sqldf+ "select * from DFo group by Date"))
Id Date Time Quality Lat Long
1 STM05-2 2005/02/28 12:49 Good -35.928 177.328
2 STM05-1 2005/03/01 18:47 Poor -34.850 177.059
sqldf> ## Not run:
sqldf> ##D
sqldf> ##D # test of file connections with sqldf
sqldf> ##D
sqldf> ##D # create test .csv file of just 3 records
sqldf> ##D write.table(head(iris, 3), "iris3.dat", sep = ",", quote = FALSE)
sqldf> ##D
sqldf> ##D # look at contents of iris3.dat
sqldf> ##D readLines("iris3.dat")
sqldf> ##D
sqldf> ##D # set up file connection
sqldf> ##D iris3 <- file("iris3.dat")
sqldf> ##D sqldf('select * from iris3 where "Sepal.Width" > 3')
sqldf> ##D
sqldf> ##D # using a non-default separator
sqldf> ##D # file.format can be an attribute of file object or an arg passed to sqldf
sqldf> ##D write.table(head(iris, 3), "iris3.dat", sep = ";", quote = FALSE)
sqldf> ##D iris3 <- file("iris3.dat")
sqldf> ##D sqldf('select * from iris3 where "Sepal.Width" > 3', file.format = list(sep = ";"))
sqldf> ##D
sqldf> ##D # same but pass file.format through attribute of file object
sqldf> ##D attr(iris3, "file.format") <- list(sep = ";")
sqldf> ##D sqldf('select * from iris3 where "Sepal.Width" > 3')
sqldf> ##D
sqldf> ##D # copy file straight to disk without going through R
sqldf> ##D # and then retrieve portion into R
sqldf> ##D sqldf('select * from iris3 where "Sepal.Width" > 3', dbname = tempfile())
sqldf> ##D
sqldf> ##D ### same as previous example except it allows multiple queries against
sqldf> ##D ### the database. We use iris3 from before. This time we use an
sqldf> ##D ### in memory SQLite database.
sqldf> ##D
sqldf> ##D sqldf() # open a connection
sqldf> ##D sqldf('select * from iris3 where "Sepal.Width" > 3')
sqldf> ##D
sqldf> ##D # At this point we have an iris3 variable in both
sqldf> ##D # the R workspace and in the SQLite database so we need to
sqldf> ##D # explicitly let it know we want the version in the database.
sqldf> ##D # If we were not to do that it would try to use the R version
sqldf> ##D # by default and fail since sqldf would prevent it from
sqldf> ##D # overwriting the version already in the database to protect
sqldf> ##D # the user from inadvertent errors.
sqldf> ##D sqldf('select * from main.iris3 where "Sepal.Width" > 4')
sqldf> ##D sqldf('select * from main.iris3 where "Sepal_Width" < 4')
sqldf> ##D sqldf() # close connection
sqldf> ##D
sqldf> ##D ### another way to do this is a mix of sqldf and RSQLite statements
sqldf> ##D ### In that case we need to fetch the connection for use with RSQLite
sqldf> ##D ### and do not have to specifically refer to main since RSQLite can
sqldf> ##D ### only access the database.
sqldf> ##D
sqldf> ##D con <- sqldf()
sqldf> ##D # this iris3 refers to the R variable and file
sqldf> ##D sqldf('select * from iris3 where "Sepal.Width" > 3')
sqldf> ##D sqldf("select count(*) from iris3")
sqldf> ##D # these iris3 refer to the database table
sqldf> ##D dbGetQuery(con, 'select * from iris3 where "Sepal.Width" > 4')
sqldf> ##D dbGetQuery(con, 'select * from iris3 where "Sepal.Width" < 4')
sqldf> ##D sqldf()
sqldf> ##D
sqldf> ## End(Not run)
R语言学习笔记:SQL操作的更多相关文章
- R语言学习笔记之: 论如何正确把EXCEL文件喂给R处理
博客总目录:http://www.cnblogs.com/weibaar/p/4507801.html ---- 前言: 应用背景兼吐槽 继续延续之前每个月至少一次更新博客,归纳总结学习心得好习惯. ...
- R语言学习笔记:基础知识
1.数据分析金字塔 2.[文件]-[改变工作目录] 3.[程序包]-[设定CRAN镜像] [程序包]-[安装程序包] 4.向量 c() 例:x=c(2,5,8,3,5,9) 例:x=c(1:100) ...
- R语言学习笔记(二)
今天主要学习了两个统计学的基本概念:峰度和偏度,并且用R语言来描述. > vars<-c("mpg","hp","wt") > ...
- R语言学习笔记:字符串处理
想在R语言中生成一个图形文件的文件名,前缀是fitbit,后面跟上月份,再加上".jpg",先不百度,试了试其它语言的类似语法,没一个可行的: C#中:"fitbit" ...
- R语言学习笔记:向量
向量是R语言最基本的数据类型. 单个数值(标量)其实没有单独的数据类型,它只不过是只有一个元素的向量. x <- c(1, 2, 4, 9) x <- c(x[1:3], 88, x[4] ...
- R语言学习笔记:小试R环境
买了三本R语言的书,同时使用来学习R语言,粗略翻下来感觉第一本最好: <R语言编程艺术>The Art of R Programming <R语言初学者使用>A Beginne ...
- R语言学习笔记:向量化
R语言最强大的方面之一就是函数的向量化,这些函数可以直接对向量的每个元素进行操作.例如: 对每个元素进行开方 > v<-c(4,3,8,16,7.3) > v [1] 4.0 3 ...
- R语言学习笔记:使用reshape2包实现整合与重构
R语言中提供了许多用来整合和重塑数据的强大方法. 整合 aggregate 重塑 reshape 在整合数据时,往往将多组观测值替换为根据这些观测计算的描述统计量. 在重塑数据时,则会通过修改数据的结 ...
- R语言学习笔记1——R语言中的基本对象
R语言,一种自由软件编程语言与操作环境,主要用于统计分析.绘图.数据挖掘.R本来是由来自新西兰奥克兰大学的Ross Ihaka和Robert Gentleman开发(也因此称为R),现在由“R开发核心 ...
- R语言学习笔记——C#中如何使用R语言setwd()函数
在R语言编译器中,设置当前工作文件夹可以用setwd()函数. > setwd("e://桌面//")> setwd("e:\桌面\")> s ...
随机推荐
- [Linux] 查看系统启动时间
查找系统最后启动时间 1. 使用 who 命令 who -b 输出: system boot 2015-10-14 00:51 2. 使用 last 命令 last reboot | head -1 ...
- .Net底层剖析目录章节
[.Net底层剖析]目录章节 1.[深入浅出.Net IL]1.一个For循环引发的IL 2.[.Net底层剖析]2.stfld指令-给对象的字段赋值 3.[.Net底层剖析]3.用IL来理解属性 作 ...
- excel导入记录
use DangJian SELECT value1, value2 into Table2 from Table1 select COUNT(*) from tmpdangyuan where 手机号 ...
- 使用autotools系列工具自动部署源代码编译安装
在Linux系统下开发一个较大的项目,完全手动建立Makefile是一件费力而又容易出错的工作.autotools系列工具只需用户输入简单的目标文件.依赖文件.文件目录等就可以比较轻松地生成Makef ...
- JS魔法堂:doctype我们应该了解的基础知识
一.前言 什么是doctype?其实我们一直使用,却很少停下来看清楚它到底是什么,对网页有什么作用.本篇将和大家一起探讨那个默默无闻的doctype吧! 二.什么是doctype doctype或DT ...
- 一、HTTPServer,RequestHandler,ServerHandler,Handler
1. HTTPServer,RequestHandler,ServerHandler,Handler 1.1 基本概念 HTTPServer主要是对传输控制层HTTP,TCP,S ...
- 0525Sprint回顾
1.回顾组织 主题:“我们下次怎么样才能更加认真对待?” 时间:设定为1至2个小时. 参与者:整个团队. 场所:能够在不受干扰的情况下讨论. 秘书:指定某人当秘书,筹备.记录.整理. 2.回顾流程 ...
- Math APP 2.0
首先,我们把这个软件理解成一个投入市场的.帮助小朋友进行算术运算练习的APP. 从质量保证的角度,有哪些需要改进的BUG? 从用户的角度(把自己当成小学生或真的请小学生帮忙),需要在哪些方面进行改进? ...
- Mybatis 中在传参时,${} 和#{} 的区别
介绍 MyBatis中使用parameterType向SQL语句传参,parameterType后的类型可以是基本类型int,String,HashMap和java自定义类型. 在SQL中引用这些参数 ...
- TinyOS和Deluge的安装模拟(一)
介绍 TinyOS是一款嵌入式操作系统,相信做无线传感器网络开发的同志们都不陌生.同类型的系统有不少,但是TinyOS的应用较之其他系统更为广泛.TinyOS 1.x版本和2.x版本是目前主要的两个分 ...