census = read.csv("census.csv")
library(caTools)
set.seed(2000)
spl = sample.split(census$over50k,SplitRatio = 0.6)
train = subset(census,spl == TRUE)
test = subset(census, spl == FALSE)
# use the logistic regression
glm = glm(over50k ~. , data = train, family = "binomial")
summary(glm) #pr(>|z|) if it is smaller than 0.1, the variables are significant

#accuracy
glm.pred = predict(glm, newdata = test, type = "response")
table(test$over50k,glm.pred >= 0.5)

(9051+1888)/nrow(test)

#baseline accuracy of test - more frequent outcome
table(test$over50k)
9713/nrow(test)

#ROC & ACU
library(ROCR)
#Then we can generate the confusion matrix
ROCpred = prediction(glm.pred, test$over50k)
plot(performance(ROCpred,measure="tpr",x.measure="fpr"),colorize = TRUE)
as.numeric(performance(ROCpred, "auc")@y.values)

#Problem 2.1 - A CART Model
library(rpart)
library(rpart.plot)
CTree = rpart(over50k ~. , data = train, method = "class")
prp(CTree)

# accuracy of the CART model
CTree.pred = predict(CTree, newdata = test, type = "class")

table(test$over50k,CTree.pred)
(9243+1596)/nrow(test)

#use another way- generate probabilities and use a threshold of 0.5 like in logistic regression
CTree.pred1 = predict(CTree, newdata = test)
p = CTree.pred1[,2] # the column of over 50k
table(test$over50k, p) # p<=0.5 it is same with the <=50k, p>0.5 means >50k

# ROC curve for the CART model - WOW
#removing the type="class" argument when making predictions
library(ROCR)
library(arulesViz)
CTree.ROCpred = prediction(CTree.pred1[,2],test$over50k)
# plot(CTree.ROCpred) can not run
plot(performance(CTree.ROCpred,measure="tpr",x.measure="fpr"),colorize = TRUE)

# to caculate the auc
as.numeric(performance(CTree.ROCpred,"auc")@y.values)

# another way to seek for auc
CTree.ROCpred2 = prediction(p,test$over50k)
as.numeric(performance(CTree.ROCpred2,"auc")@y.values)

#Problem 3.1 - A Random Forest Model
set.seed(1)
trainSmall = train[sample(nrow(train),2000),]

set.seed(1)
library(randomForest)
RFC = randomForest(over50k ~., data = trainSmall)
RFC.pred = predict(RFC,newdata = test) #using a threshold of 0.5, no need to set the type = "class"
table(test$over50k,RFC.pred)
(9586+1093)/nrow(test) # a little difference is allowed

#compute metrics that give us insight into which variables are important.
vu = varUsed(RFC, count = TRUE)
vusorted = sort(vu, decreasing = FALSE, index.return = TRUE)
dotchart(vnsorted$x, names(RFC$forest$xlevel[vusorted$ix]))

#another way to find the important variables - impurity
varImpPlot(RFC)

# select cp by Cross-validation for the CART Trees
library(caret)
library(e1071)
set.seed(2)
#Specify that we are going to use k-fold cross validation with 10 folds:
numFolds = trainControl(method = "cv", number = 10)
#Specify the grid of cp values that we wish to evaluate:
cartGrid = expand.grid(.cp = seq(0.002,0.1,0.002))
#run the train function and view the result:
tr = train(over50k ~.,data = train, method = "rpart", trControl = numFolds, tuneGrid = cartGrid)
tr # The final value used for the model was cp = 0.002.

CTree2 = rpart(over50k ~., data = train, method = "class", cp = 0.002)
CTree2.pred = predict(CTree2, newdata = test, type = "class")
table(test$over50k, CTree2.pred)
(9178+1838)/nrow(test)
prp(CTree2) # shoould be 18 splits

[Machine Learning][The Analytics Edge][Predicting Earnings from Census Data]的更多相关文章

  1. Machine Learning for Developers

    Machine Learning for Developers Most developers these days have heard of machine learning, but when ...

  2. How do I learn machine learning?

    https://www.quora.com/How-do-I-learn-machine-learning-1?redirected_qid=6578644   How Can I Learn X? ...

  3. Course Machine Learning Note

    Machine Learning Note Introduction Introduction What is Machine Learning? Two definitions of Machine ...

  4. [C2P3] Andrew Ng - Machine Learning

    ##Advice for Applying Machine Learning Applying machine learning in practice is not always straightf ...

  5. Why The Golden Age Of Machine Learning is Just Beginning

    Why The Golden Age Of Machine Learning is Just Beginning Even though the buzz around neural networks ...

  6. Introducing: Machine Learning in R(转)

    Machine learning is a branch in computer science that studies the design of algorithms that can lear ...

  7. Azure Machine Learning

    About me In my spare time, I love learning new technologies and going to hackathons. Our hackathon p ...

  8. Getting started with machine learning in Python

    Getting started with machine learning in Python Machine learning is a field that uses algorithms to ...

  9. Google's Machine Learning Crash Course #01# Introducing ML & Framing & Fundamental terminology

    INDEX Introducing ML Framing Fundamental machine learning terminology Introducing ML What you learn ...

随机推荐

  1. centos7如何查看网络状态?

    参考https://www.jb51.net/os/RedHat/520187.html 查看网络状态: lsof -Pnl +M -i4 显示ipv4服务及监听端情况 netstat -anp 所有 ...

  2. onOptionsItemSelected、onMenuItemSelected、onContextItemSelected 区别

         1.在点击选项菜单(OptionsMenu:点击menu弹出的菜单)的菜单项时即调用了onMenuItemSelected 也调用了onOptionsItemSelected ,于是疑惑他们 ...

  3. 廖雪峰Java7处理日期和时间-2Data和Calendar-1Date

    计算机中如何存储和表示日期和时间 Epoch Time:从1970年1月1日零点(格林威治时区/GMT+00:00)到现在经历的秒数,也叫timestamp, 例如: 秒级: * 北京 2016-11 ...

  4. 关于CoreData的用法

    有些同事觉得CoreData是一个看不懂,理解不清的神秘东东,其实ios的本地数据储存是一个sqlite数据库,一个简易的数据库,而这个CoreData是否支持所有储存的数据呢,显然不是的,站在我的角 ...

  5. 一个简单的通讯服务框架(大家发表意见一起研究)JAVA版本

    最近研究下java语言,根据一般使用的情况,写了个连接通讯服务的框架: 框架结构 C-Manager-S; 把所有通讯内容抽取成三个方法接口:GetData,SetData,带返还的Get; 所有数据 ...

  6. leetCode27.移除元素

    给定一个数组 nums 和一个值 val,你需要原地移除所有数值等于 val 的元素,返回移除后数组的新长度. 不要使用额外的数组空间,你必须在原地修改输入数组并在使用 O(1) 额外空间的条件下完成 ...

  7. 面向连接的tcp 编程

    from socket import * serverSocket=socket(AF_INET,SOCK_STREAM) serverSocket.bind(("",8899)) ...

  8. centos6.5部署redmine3.2

    ruby 2.1 + rails 4.2+ mysql 5.6 +centos6.5 + rvm 1.29 1.基本的软件环境 yum -y install libyaml-devel zlib-de ...

  9. list按照某个元素进行排序

    import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.u ...

  10. 在delphi中XLSReadWriteII.组件的应用实例(2)

    第三方组件:XLSReadWriteII.v.5.20.67_XE3 实例源码如下:   unit Unit1; interface uses Winapi.Windows, Winapi.Messa ...