#-----------------------------------------------------------------------------#
# R in Action (2nd ed): Chapter 17 #
# Classification #
# requires packaged rpart, party, randomForest, kernlab, rattle #
# install.packages(c("rpart", "party", "randomForest", "e1071", "rpart.plot") #
# install.packages(rattle, dependencies = c("Depends", "Suggests")) #
#-----------------------------------------------------------------------------# par(ask=TRUE) # Listing 17.1 - Prepare the breast cancer data
loc <- "http://archive.ics.uci.edu/ml/machine-learning-databases/"
ds <- "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
url <- paste(loc, ds, sep="") breast <- read.table(url, sep=",", header=FALSE, na.strings="?")
names(breast) <- c("ID", "clumpThickness", "sizeUniformity",
"shapeUniformity", "maginalAdhesion",
"singleEpithelialCellSize", "bareNuclei",
"blandChromatin", "normalNucleoli", "mitosis", "class") df <- breast[-1]
df$class <- factor(df$class, levels=c(2,4),
labels=c("benign", "malignant")) set.seed(1234)
train <- sample(nrow(df), 0.7*nrow(df))
df.train <- df[train,]
df.validate <- df[-train,]
table(df.train$class)
table(df.validate$class) # Listing 17.2 - Logistic regression with glm()
fit.logit <- glm(class~., data=df.train, family=binomial())
summary(fit.logit)
prob <- predict(fit.logit, df.validate, type="response")
logit.pred <- factor(prob > .5, levels=c(FALSE, TRUE),
labels=c("benign", "malignant"))
logit.perf <- table(df.validate$class, logit.pred,
dnn=c("Actual", "Predicted"))
logit.perf # Listing 17.3 - Creating a classical decision tree with rpart()
library(rpart)
set.seed(1234)
dtree <- rpart(class ~ ., data=df.train, method="class",
parms=list(split="information"))
dtree$cptable
plotcp(dtree) dtree.pruned <- prune(dtree, cp=.0125) library(rpart.plot)
prp(dtree.pruned, type = 2, extra = 104,
fallen.leaves = TRUE, main="Decision Tree") dtree.pred <- predict(dtree.pruned, df.validate, type="class")
dtree.perf <- table(df.validate$class, dtree.pred,
dnn=c("Actual", "Predicted"))
dtree.perf # Listing 17.4 - Creating a conditional inference tree with ctree()
library(party)
fit.ctree <- ctree(class~., data=df.train)
plot(fit.ctree, main="Conditional Inference Tree") ctree.pred <- predict(fit.ctree, df.validate, type="response")
ctree.perf <- table(df.validate$class, ctree.pred,
dnn=c("Actual", "Predicted"))
ctree.perf # Listing 17.5 - Random forest
library(randomForest)
set.seed(1234)
fit.forest <- randomForest(class~., data=df.train,
na.action=na.roughfix,
importance=TRUE)
fit.forest
importance(fit.forest, type=2)
forest.pred <- predict(fit.forest, df.validate)
forest.perf <- table(df.validate$class, forest.pred,
dnn=c("Actual", "Predicted"))
forest.perf # Listing 17.6 - A support vector machine
library(e1071)
set.seed(1234)
fit.svm <- svm(class~., data=df.train)
fit.svm
svm.pred <- predict(fit.svm, na.omit(df.validate))
svm.perf <- table(na.omit(df.validate)$class,
svm.pred, dnn=c("Actual", "Predicted"))
svm.perf # Listing 17.7 Tuning an RBF support vector machine (this can take a while)
set.seed(1234)
tuned <- tune.svm(class~., data=df.train,
gamma=10^(-6:1),
cost=10^(-10:10))
tuned
fit.svm <- svm(class~., data=df.train, gamma=.01, cost=1)
svm.pred <- predict(fit.svm, na.omit(df.validate))
svm.perf <- table(na.omit(df.validate)$class,
svm.pred, dnn=c("Actual", "Predicted"))
svm.perf # Listing 17.8 Function for assessing binary classification accuracy
performance <- function(table, n=2){
if(!all(dim(table) == c(2,2)))
stop("Must be a 2 x 2 table")
tn = table[1,1]
fp = table[1,2]
fn = table[2,1]
tp = table[2,2]
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
ppp = tp/(tp+fp)
npp = tn/(tn+fn)
hitrate = (tp+tn)/(tp+tn+fp+fn)
result <- paste("Sensitivity = ", round(sensitivity, n) ,
"\nSpecificity = ", round(specificity, n),
"\nPositive Predictive Value = ", round(ppp, n),
"\nNegative Predictive Value = ", round(npp, n),
"\nAccuracy = ", round(hitrate, n), "\n", sep="")
cat(result)
} # Listing 17.9 - Performance of breast cancer data classifiers
performance(dtree.perf)
performance(ctree.perf)
performance(forest.perf)
performance(svm.perf) # Using Rattle Package for data mining loc <- "http://archive.ics.uci.edu/ml/machine-learning-databases/"
ds <- "pima-indians-diabetes/pima-indians-diabetes.data"
url <- paste(loc, ds, sep="")
diabetes <- read.table(url, sep=",", header=FALSE)
names(diabetes) <- c("npregant", "plasma", "bp", "triceps",
"insulin", "bmi", "pedigree", "age", "class")
diabetes$class <- factor(diabetes$class, levels=c(0,1),
labels=c("normal", "diabetic"))
library(rattle)
rattle()

吴裕雄--天生自然 R语言开发学习:分类(续二)的更多相关文章

  1. 吴裕雄--天生自然 R语言开发学习:R语言的安装与配置

    下载R语言和开发工具RStudio安装包 先安装R

  2. 吴裕雄--天生自然 R语言开发学习:数据集和数据结构

    数据集的概念 数据集通常是由数据构成的一个矩形数组,行表示观测,列表示变量.表2-1提供了一个假想的病例数据集. 不同的行业对于数据集的行和列叫法不同.统计学家称它们为观测(observation)和 ...

  3. 吴裕雄--天生自然 R语言开发学习:导入数据

    2.3.6 导入 SPSS 数据 IBM SPSS数据集可以通过foreign包中的函数read.spss()导入到R中,也可以使用Hmisc 包中的spss.get()函数.函数spss.get() ...

  4. 吴裕雄--天生自然 R语言开发学习:使用键盘、带分隔符的文本文件输入数据

    R可从键盘.文本文件.Microsoft Excel和Access.流行的统计软件.特殊格 式的文件.多种关系型数据库管理系统.专业数据库.网站和在线服务中导入数据. 使用键盘了.有两种常见的方式:用 ...

  5. 吴裕雄--天生自然 R语言开发学习:R语言的简单介绍和使用

    假设我们正在研究生理发育问 题,并收集了10名婴儿在出生后一年内的月龄和体重数据(见表1-).我们感兴趣的是体重的分 布及体重和月龄的关系. 可以使用函数c()以向量的形式输入月龄和体重数据,此函 数 ...

  6. 吴裕雄--天生自然 R语言开发学习:基础知识

    1.基础数据结构 1.1 向量 # 创建向量a a <- c(1,2,3) print(a) 1.2 矩阵 #创建矩阵 mymat <- matrix(c(1:10), nrow=2, n ...

  7. 吴裕雄--天生自然 R语言开发学习:图形初阶(续二)

    # ----------------------------------------------------# # R in Action (2nd ed): Chapter 3 # # Gettin ...

  8. 吴裕雄--天生自然 R语言开发学习:图形初阶(续一)

    # ----------------------------------------------------# # R in Action (2nd ed): Chapter 3 # # Gettin ...

  9. 吴裕雄--天生自然 R语言开发学习:图形初阶

    # ----------------------------------------------------# # R in Action (2nd ed): Chapter 3 # # Gettin ...

  10. 吴裕雄--天生自然 R语言开发学习:基本图形(续二)

    #---------------------------------------------------------------# # R in Action (2nd ed): Chapter 6 ...

随机推荐

  1. UML-领域模型-如何找到概念类

    有3个方法 方法1:对已有的重用和修改(这是最好的方法) 重用和修改现有模型.这些模型来源于之前的项目.网上的 方法2:使用分类列表 从网上搜索该领域的常见分类,或参考书籍Martin Fowler的 ...

  2. drf框架知识点总复习

    接口 """ 1.什么是接口:url+请求参数+响应数据 | 接口文档 2.接口规范: url:https,api,资源(名词复数), v1,get|post表示操作资源 ...

  3. mongodb 批量改变某一列类型 比如 String改为double,insert into select 批量插入 批量修改

    //type:2代表String 1.String变Double db.集合.find({"列":{$type:2}}).forEach(function(x){ x.列=pars ...

  4. Vue2--非父子组件通信笔记

    核心要点: var Event=new Vue(); Event.$emit(事件名称, 数据) Event.$on(事件名称,function(data){ //data }.bind(this)) ...

  5. vue路由的跳转-路由传参-cookies插件-axios插件-跨域问题-element-ui插件

    ---恢复内容开始--- 项目初始化 创建一个纯净的vue环境项目,手动书写全局的样式配置,全局的main,js配置 (1)如果vue项目在重构或者出错的时候,手动安装node_modules. 如果 ...

  6. vim的背景配置

    1.vim背景颜色的配置 在usr/share/vim/vim74/colors   目录下有可以配置的颜色方案,可以查看: 命令:vim/etc/vimrc 打开vim的配置文件,   在最后加入一 ...

  7. OAuth 2.0安全案例回顾

    转载自:http://www.360doc.com/content/14/0311/22/834950_359713295.shtml 0x00 背景 纵观账号互通发展史,可以发现OAuth比起其它协 ...

  8. 七种常见经典排序算法总结(C++实现)

    排序算法是非常常见也非常基础的算法,以至于大部分情况下它们都被集成到了语言的辅助库中.排序算法虽然已经可以很方便的使用,但是理解排序算法可以帮助我们找到解题的方向. 1. 冒泡排序 (Bubble S ...

  9. PAT甲级——1012 The Best Rank

    PATA1012 The Best Rank To evaluate the performance of our first year CS majored students, we conside ...

  10. CentOS6与CentOS7的网络区别

    回顾:物理层 关注的是接口物理特性,传输介质数据链路层 MAC地址,数据帧,以太网,交换机网络层 IP地址,数据包,IP\ICMP\ARP协议,路由器传输层 TCP.UDP,端口号,数据段应用层 HT ...