census = read.csv("census.csv")
library(caTools)
set.seed(2000)
spl = sample.split(census$over50k,SplitRatio = 0.6)
train = subset(census,spl == TRUE)
test = subset(census, spl == FALSE)
# use the logistic regression
glm = glm(over50k ~. , data = train, family = "binomial")
summary(glm) #pr(>|z|) if it is smaller than 0.1, the variables are significant

#accuracy
glm.pred = predict(glm, newdata = test, type = "response")
table(test$over50k,glm.pred >= 0.5)

(9051+1888)/nrow(test)

#baseline accuracy of test - more frequent outcome
table(test$over50k)
9713/nrow(test)

#ROC & ACU
library(ROCR)
#Then we can generate the confusion matrix
ROCpred = prediction(glm.pred, test$over50k)
plot(performance(ROCpred,measure="tpr",x.measure="fpr"),colorize = TRUE)
as.numeric(performance(ROCpred, "auc")@y.values)

#Problem 2.1 - A CART Model
library(rpart)
library(rpart.plot)
CTree = rpart(over50k ~. , data = train, method = "class")
prp(CTree)

# accuracy of the CART model
CTree.pred = predict(CTree, newdata = test, type = "class")

table(test$over50k,CTree.pred)
(9243+1596)/nrow(test)

#use another way- generate probabilities and use a threshold of 0.5 like in logistic regression
CTree.pred1 = predict(CTree, newdata = test)
p = CTree.pred1[,2] # the column of over 50k
table(test$over50k, p) # p<=0.5 it is same with the <=50k, p>0.5 means >50k

# ROC curve for the CART model - WOW
#removing the type="class" argument when making predictions
library(ROCR)
library(arulesViz)
CTree.ROCpred = prediction(CTree.pred1[,2],test$over50k)
# plot(CTree.ROCpred) can not run
plot(performance(CTree.ROCpred,measure="tpr",x.measure="fpr"),colorize = TRUE)

# to caculate the auc
as.numeric(performance(CTree.ROCpred,"auc")@y.values)

# another way to seek for auc
CTree.ROCpred2 = prediction(p,test$over50k)
as.numeric(performance(CTree.ROCpred2,"auc")@y.values)

#Problem 3.1 - A Random Forest Model
set.seed(1)
trainSmall = train[sample(nrow(train),2000),]

set.seed(1)
library(randomForest)
RFC = randomForest(over50k ~., data = trainSmall)
RFC.pred = predict(RFC,newdata = test) #using a threshold of 0.5, no need to set the type = "class"
table(test$over50k,RFC.pred)
(9586+1093)/nrow(test) # a little difference is allowed

#compute metrics that give us insight into which variables are important.
vu = varUsed(RFC, count = TRUE)
vusorted = sort(vu, decreasing = FALSE, index.return = TRUE)
dotchart(vnsorted$x, names(RFC$forest$xlevel[vusorted$ix]))

#another way to find the important variables - impurity
varImpPlot(RFC)

# select cp by Cross-validation for the CART Trees
library(caret)
library(e1071)
set.seed(2)
#Specify that we are going to use k-fold cross validation with 10 folds:
numFolds = trainControl(method = "cv", number = 10)
#Specify the grid of cp values that we wish to evaluate:
cartGrid = expand.grid(.cp = seq(0.002,0.1,0.002))
#run the train function and view the result:
tr = train(over50k ~.,data = train, method = "rpart", trControl = numFolds, tuneGrid = cartGrid)
tr # The final value used for the model was cp = 0.002.

CTree2 = rpart(over50k ~., data = train, method = "class", cp = 0.002)
CTree2.pred = predict(CTree2, newdata = test, type = "class")
table(test$over50k, CTree2.pred)
(9178+1838)/nrow(test)
prp(CTree2) # shoould be 18 splits

[Machine Learning][The Analytics Edge][Predicting Earnings from Census Data]的更多相关文章

  1. Machine Learning for Developers

    Machine Learning for Developers Most developers these days have heard of machine learning, but when ...

  2. How do I learn machine learning?

    https://www.quora.com/How-do-I-learn-machine-learning-1?redirected_qid=6578644   How Can I Learn X? ...

  3. Course Machine Learning Note

    Machine Learning Note Introduction Introduction What is Machine Learning? Two definitions of Machine ...

  4. [C2P3] Andrew Ng - Machine Learning

    ##Advice for Applying Machine Learning Applying machine learning in practice is not always straightf ...

  5. Why The Golden Age Of Machine Learning is Just Beginning

    Why The Golden Age Of Machine Learning is Just Beginning Even though the buzz around neural networks ...

  6. Introducing: Machine Learning in R(转)

    Machine learning is a branch in computer science that studies the design of algorithms that can lear ...

  7. Azure Machine Learning

    About me In my spare time, I love learning new technologies and going to hackathons. Our hackathon p ...

  8. Getting started with machine learning in Python

    Getting started with machine learning in Python Machine learning is a field that uses algorithms to ...

  9. Google's Machine Learning Crash Course #01# Introducing ML & Framing & Fundamental terminology

    INDEX Introducing ML Framing Fundamental machine learning terminology Introducing ML What you learn ...

随机推荐

  1. 前端-JavaScript1-2——JavaScript建立认知

    关于首篇的“ Hello world ! ”这事儿吧,挺有意思,就是学习任何的语言,我们都喜欢在屏幕上直接输出一点什么,当做最简单.最基本的案例.输出什么大家随意,但是很多人都习惯输出“hello w ...

  2. Java构造器练习题

    仔细阅读下面的程序 public class Car { String name = "汽车"; public Car(String name) { this.name = nam ...

  3. 涂抹mysql笔记-管理mysql库和表

    mysql的表对象是基于库维护的,也就是说它属于某个库,不管对象是由谁创建的,只要库在表就在.这根Oracle不同Oracle中的表对象是基于用户的.属于创建改对象的用户所有,用户在表就在.mysql ...

  4. 七、XHTML介绍

    XHTML简介 1.什么是XHTML? XHTML指的是可扩展超文本标记语言 XHTML与HTML4.01几乎是相同的 XHTML是更严格更纯净的HTML版本 XHTML得到所有主流浏览器的支持 2. ...

  5. 注解_Annotation

    ---恢复内容开始--- 一.什么是注解 注解,英文Annotation,它不是程序本身,是对程序的解释,在这里我会想到为什么不能使用注释呢,因为注解是关于程序对信息的处理的流程的一些说明,而且格式也 ...

  6. leetcode207

    拓扑排序问题. class Solution { public: bool canFinish(int numCourses, vector<pair<int, int>>&a ...

  7. docker报错

    用docker搭建环境时可能会遇到错误:No releases available for package "xxxx" No releases available for pac ...

  8. ODPS SQL <for 数据操作语言DML>

    基本操作: 查询: SELECT [ALL | DISTINCT] select_expr, select_expr, ... FROM table_reference [WHERE where_co ...

  9. LevelDB源码分析-MemTable

    MemTable(db/memtable.h db/memtable.cc db/skiplist.h) LevelDB中存储在内存中的那部分KV数据都存储在memtable中,而memtable中的 ...

  10. sample function

    #coding:utf8 import requests import json import ssl import datetime import urllib import sys import ...