# Load the census data.
# Strings are read as factors explicitly: since R 4.0 read.csv() defaults to
# stringsAsFactors = FALSE, and the classification models fitted on this data
# (binomial glm, randomForest) need the outcome over50k to be a factor.
census <- read.csv("census.csv", stringsAsFactors = TRUE)
library(caTools)
# Fix the RNG seed so the train/test split is reproducible.
set.seed(2000)
# sample.split() returns a logical vector; TRUE marks the 60% of rows that
# go to the training set, FALSE the rows that go to the test set.
spl <- sample.split(census$over50k, SplitRatio = 0.6)
train <- subset(census, spl == TRUE)
test <- subset(census, spl == FALSE)
# Logistic regression: model over50k using every other column as a predictor.
# NOTE: the fitted model is stored under the name `glm`. That shadows the
# function for variable lookups only; a call such as glm(...) still resolves
# to the modelling function.
glm <- glm(over50k ~ ., data = train, family = "binomial")
# In the coefficient table, Pr(>|z|) below 0.1 marks a significant variable.
summary(glm)

# Accuracy on the test set
# type = "response" asks for predicted probabilities.
glm.pred <- predict(glm, newdata = test, type = "response")
# Confusion matrix at a 0.5 threshold. The thresholded prediction is turned
# into a factor with both levels so the table is always 2 x 2, even if every
# prediction falls on the same side of the threshold.
glm.cm <- table(
  test$over50k,
  factor(glm.pred >= 0.5, levels = c(FALSE, TRUE))
)
glm.cm

# Accuracy = correct predictions (the diagonal) / number of test rows.
# Computed from the table rather than from hand-typed counts, so it stays
# correct if the data, the seed or the R version changes the split.
sum(diag(glm.cm)) / nrow(test)

# Baseline accuracy on the test set: always predict the more frequent outcome.
test.counts <- table(test$over50k)
test.counts
# Share of the majority class, taken from the counts instead of a
# hand-typed number so it follows the actual split.
max(test.counts) / nrow(test)

# ROC curve and AUC for the logistic regression model
library(ROCR)
# Pair the predicted probabilities with the true test labels in a ROCR
# prediction object (this is not a confusion matrix).
ROCpred <- prediction(predictions = glm.pred, labels = test$over50k)
# ROC curve: true positive rate (y) against false positive rate (x).
plot(
  performance(ROCpred, measure = "tpr", x.measure = "fpr"),
  colorize = TRUE
)
# Area under the ROC curve, read from the y.values slot of the
# performance object.
as.numeric(slot(performance(ROCpred, measure = "auc"), "y.values"))

# Problem 2.1 - A CART Model
library(rpart)
library(rpart.plot)
# Classification tree (method = "class") on all predictors.
CTree <- rpart(over50k ~ ., data = train, method = "class")
prp(CTree)

# Accuracy of the CART model on the test set
# type = "class" returns the predicted class label for each test row.
CTree.pred <- predict(CTree, newdata = test, type = "class")

# Confusion matrix: actual outcome (rows) against predicted class (columns).
CTree.cm <- table(test$over50k, CTree.pred)
CTree.cm
# Accuracy = diagonal of the confusion matrix / number of test rows,
# computed from the table instead of hand-typed counts.
sum(diag(CTree.cm)) / nrow(test)

# Alternative: predict class probabilities instead of class labels, so they
# can be read against a 0.5 threshold like the logistic regression output.
# type = "prob" is what predict() uses by default for a classification tree;
# it is spelled out here for clarity.
CTree.pred1 <- predict(CTree, newdata = test, type = "prob")
# Second column: the probability of the over-50k class.
p <- CTree.pred1[, 2]
# Actual outcome against each distinct predicted probability:
# columns with p <= 0.5 correspond to predicting <=50k, p > 0.5 to >50k.
table(test$over50k, p)

# ROC curve for the CART model
# Uses the class probabilities, i.e. predictions made without type = "class".
library(ROCR)
# NOTE(review): no visible call in this section uses arulesViz; the library
# call is kept exactly as in the original script.
library(arulesViz)
CTree.ROCpred <- prediction(
  predictions = CTree.pred1[, 2],
  labels = test$over50k
)
# plot(CTree.ROCpred) does not run (original author's note); plot a
# performance object built from it instead.
plot(
  performance(CTree.ROCpred, measure = "tpr", x.measure = "fpr"),
  colorize = TRUE
)

# AUC of the CART model
as.numeric(slot(performance(CTree.ROCpred, measure = "auc"), "y.values"))

# The same AUC again, this time starting from the probability vector p
CTree.ROCpred2 <- prediction(predictions = p, labels = test$over50k)
as.numeric(slot(performance(CTree.ROCpred2, measure = "auc"), "y.values"))

# Problem 3.1 - A Random Forest Model
# Train on a random sample of 2000 rows of the training set.
set.seed(1)
trainSmall <- train[sample(nrow(train), 2000), ]

# Re-seed before growing the forest so the model itself is reproducible.
set.seed(1)
library(randomForest)
RFC <- randomForest(over50k ~ ., data = trainSmall)
# predict() is called without a type argument; the original author notes
# this already corresponds to a 0.5 threshold, so type = "class" is not
# needed.
RFC.pred <- predict(RFC, newdata = test)
# Confusion matrix: actual outcome (rows) against predicted class (columns).
RFC.cm <- table(test$over50k, RFC.pred)
RFC.cm
# Accuracy computed from the table. The original used hand-typed counts and
# noted that "a little difference is allowed"; deriving the value from the
# actual predictions removes that mismatch.
sum(diag(RFC.cm)) / nrow(test)

# Variable importance, view 1: how often each variable is used in the forest.
# varUsed(count = TRUE) gives one usage count per predictor.
vu <- varUsed(RFC, count = TRUE)
# index.return = TRUE keeps the original positions ($ix) next to the sorted
# counts ($x), so the counts can be matched back to variable names.
vusorted <- sort(vu, decreasing = FALSE, index.return = TRUE)
# Fixed: the original plotted `vnsorted$x`, an undefined name (typo for
# `vusorted`), which stopped the script with "object not found".
# `xlevels` is also spelled out in full instead of relying on partial
# matching of `$xlevel`.
dotchart(vusorted$x, names(RFC$forest$xlevels[vusorted$ix]))

# Variable importance, view 2: importance by reduction in impurity.
varImpPlot(RFC)

# Choose the cp parameter of the CART tree by cross-validation
library(caret)
library(e1071)
set.seed(2)
# Resampling scheme: k-fold cross-validation with 10 folds.
numFolds <- trainControl(method = "cv", number = 10)
# Candidate cp values: 0.002 up to 0.1 in steps of 0.002.
cartGrid <- expand.grid(.cp = seq(from = 0.002, to = 0.1, by = 0.002))
# Evaluate an rpart model for every cp in the grid and show the results.
tr <- train(
  over50k ~ .,
  data = train,
  method = "rpart",
  trControl = numFolds,
  tuneGrid = cartGrid
)
tr # The final value used for the model was cp = 0.002.

# Refit the classification tree on the training set with cp = 0.002, the
# value reported by the cross-validation step.
CTree2 <- rpart(over50k ~ ., data = train, method = "class", cp = 0.002)
CTree2.pred <- predict(CTree2, newdata = test, type = "class")
# Confusion matrix: actual outcome (rows) against predicted class (columns).
CTree2.cm <- table(test$over50k, CTree2.pred)
CTree2.cm
# Accuracy = diagonal of the confusion matrix / number of test rows,
# computed from the table instead of hand-typed counts.
sum(diag(CTree2.cm)) / nrow(test)
prp(CTree2) # should be 18 splits (original author's expectation)

[Machine Learning][The Analytics Edge][Predicting Earnings from Census Data]的更多相关文章

  1. Machine Learning for Developers

    Machine Learning for Developers Most developers these days have heard of machine learning, but when ...

  2. How do I learn machine learning?

    https://www.quora.com/How-do-I-learn-machine-learning-1?redirected_qid=6578644   How Can I Learn X? ...

  3. Course Machine Learning Note

    Machine Learning Note Introduction Introduction What is Machine Learning? Two definitions of Machine ...

  4. [C2P3] Andrew Ng - Machine Learning

    ##Advice for Applying Machine Learning Applying machine learning in practice is not always straightf ...

  5. Why The Golden Age Of Machine Learning is Just Beginning

    Why The Golden Age Of Machine Learning is Just Beginning Even though the buzz around neural networks ...

  6. Introducing: Machine Learning in R(转)

    Machine learning is a branch in computer science that studies the design of algorithms that can lear ...

  7. Azure Machine Learning

    About me In my spare time, I love learning new technologies and going to hackathons. Our hackathon p ...

  8. Getting started with machine learning in Python

    Getting started with machine learning in Python Machine learning is a field that uses algorithms to ...

  9. Google's Machine Learning Crash Course #01# Introducing ML & Framing & Fundamental terminology

    INDEX Introducing ML Framing Fundamental machine learning terminology Introducing ML What you learn ...

随机推荐

  1. IDEA—— 找不到或无法加载主类Main

    最近使用idea,编写了一个项目,发现老是找不到main,网上找了一大圈的解决方案,都不行.灵机一动升级了jdk就可以了,之前用的是1.7的,换成了1.8的就好了.

  2. aspose.cells 插入图片

    ,,"d:\\1.jpg"); Aspose.Cells.Drawing.Picture pic = worksheet.Pictures[iIndex]; pic.Placeme ...

  3. C语言中的作用域,链接属性和存储类型

    作用域 当变量在程序的某个部分被声明的时候,它只有在程序的一定区域才能被访问,编译器可以确认4种不同类型的作用域:文件作用域,函数作用域,代码块作用域和原型作用域 1.代码块作用域:位于一对花括号之间 ...

  4. linux 配置vue环境

    系统 [root@Gao conf.d]# uname -a 工具 1.Final Shell 2.工具截图 需要下载的部分 node.js    npm   cnpm   vue-cli 安装nod ...

  5. pygame 简单播放音乐程序

    环境: python2.7 pygame 功能: 播放指定目录下的歌曲(暂时mp3),可以上一曲.下一曲播放. 文件目录: font  字体文件夹 image  图片文件夹 music  音乐文件夹 ...

  6. [java,2017-05-04] 创建word文档

    package test; import java.text.SimpleDateFormat; import java.util.Date; import com.aspose.words.Data ...

  7. java面试题收集

    http://www.cnblogs.com/yhason/archive/2012/06/07/2540743.html 2,java常见面试题 http://www.cnblogs.com/yha ...

  8. android 开发 ScrollView 控件的一些api描述与自定义ScrollView接口回调方法

    1.正常使用ScrollView控件的一些api详解. package com.example.lenovo.mydemoapp.scrollViewDemo; import android.supp ...

  9. spring 之 注入之 by name or by type, or both ?

    @Autowired 和  @Qualifier 使用xml 注入的时候, 我们可以指定 autowire=“byType” 或“byName” . 但是使用 注解的时候, @Autowired  只 ...

  10. Amazing ASP.NET Core 2.0【转】

    前言 ASP.NET Core 的变化和发展速度是飞快的,当你发现你还没有掌握 ASP.NET Core 1.0 的时候, 2.0 已经快要发布了,目前 2.0 处于 Preview 1 版本,意味着 ...