Classification and Prediction

# coding: utf-8

# In[128]:

get_ipython().magic(u'matplotlib inline')

import pandas as pd

from pandas import Series,DataFrame

import seaborn as sns

sns.set_style('whitegrid')

pd.set_option('display.mpl_style', 'default')

import numpy as np

import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC, LinearSVC

from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import GaussianNB

train_df= pd.read_csv("/home/lpstudy/下载/train.csv")

test_df = pd.read_csv("/home/lpstudy/下载/test.csv")

train_df.head()  

test_df.head()

# In[129]:

train_df = train_df.drop(["Ticket","PassengerId","Name"],axis = 1)

test_df = test_df.drop(["Name","Ticket"],axis =1)

# In[130]:

train_df.head()

# In[131]:

train_df["Embarked"] = train_df["Embarked"].fillna("S")

#plot

sns.factorplot("Embarked","Survived",data = train_df,size = 6,aspect = 2)

fig,(axis1,axis2,axis3) = plt.subplots(1,3,figsize = (15,5))

sns.countplot(x='Embarked', data=train_df, ax=axis1)

sns.countplot(x='Survived', hue="Embarked", data=train_df, order=[1,0], ax=axis2)

embark_perc = train_df[["Embarked", "Survived"]].groupby(['Embarked'],as_index=False).mean()

sns.barplot(x='Embarked', y='Survived', data=embark_perc,order=['S','C','Q'],ax=axis3)

embark_dummies_train  = pd.get_dummies(train_df['Embarked'])

embark_dummies_train.drop(['S'], axis=1, inplace=True)

embark_dummies_test  = pd.get_dummies(test_df['Embarked'])

embark_dummies_test.drop(['S'], axis=1, inplace=True)

train_df = train_df.join(embark_dummies_train)

test_df    = test_df.join(embark_dummies_test)

train_df.drop(['Embarked'], axis=1,inplace=True)

test_df.drop(['Embarked'], axis=1,inplace=True)

# In[132]:

test_df["Fare"].fillna(test_df["Fare"].median(), inplace=True)

train_df['Fare'] = train_df['Fare'].astype(int)

test_df['Fare']    = test_df['Fare'].astype(int)

fare_not_survived = train_df["Fare"][train_df["Survived"] == 0]

fare_survived     = train_df["Fare"][train_df["Survived"] == 1]

avgerage_fare = DataFrame([fare_not_survived.mean(), fare_survived.mean()])

std_fare      = DataFrame([fare_not_survived.std(), fare_survived.std()])

#plot

train_df['Fare'].plot(kind='hist', figsize=(15,3),bins=100, xlim=(0,50))

avgerage_fare.index.names = std_fare.index.names = ["Survived"]

avgerage_fare.plot(yerr=std_fare,kind='bar',legend=False)

# In[133]:

# Age 

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

axis1.set_title('Original Age values - Titanic')

axis2.set_title('New Age values - Titanic')

average_age_titanic   = train_df["Age"].mean()

std_age_titanic       = train_df["Age"].std()

count_nan_age_titanic = train_df["Age"].isnull().sum()

# get average, std, and number of NaN values in test_df

average_age_test   = test_df["Age"].mean()

std_age_test       = test_df["Age"].std()

count_nan_age_test = test_df["Age"].isnull().sum()

# generate random numbers between (mean - std) & (mean + std)

rand_1 = np.random.randint(average_age_titanic - std_age_titanic, average_age_titanic + std_age_titanic, size = count_nan_age_titanic)

rand_2 = np.random.randint(average_age_test - std_age_test, average_age_test + std_age_test, size = count_nan_age_test)

# plot original Age values

# NOTE: drop all null values, and convert to int

train_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)

# test_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)

# fill NaN values in Age column with random values generated

train_df["Age"][np.isnan(train_df["Age"])] = rand_1

test_df["Age"][np.isnan(test_df["Age"])] = rand_2

# convert from float to int

train_df['Age'] = train_df['Age'].astype(int)

test_df['Age']    = test_df['Age'].astype(int)

# plot new Age Values

train_df['Age'].hist(bins=70, ax=axis2)

# test_df['Age'].hist(bins=70, ax=axis4)

# In[134]:

# .... continue with plot Age column

# peaks for survived/not survived passengers by their age

facet = sns.FacetGrid(train_df, hue="Survived",aspect=4)

facet.map(sns.kdeplot,'Age',shade= True)

facet.set(xlim=(0, train_df['Age'].max()))

facet.add_legend()

# average survived passengers by age

fig, axis1 = plt.subplots(1,1,figsize=(18,4))

average_age = train_df[["Age", "Survived"]].groupby(['Age'],as_index=False).mean()

sns.barplot(x='Age', y='Survived', data=average_age)

# In[135]:

# Cabin

# It has a lot of NaN values, so it won't cause a remarkable impact on prediction

train_df.drop("Cabin",axis=1,inplace=True)

test_df.drop("Cabin",axis=1,inplace=True)

# Family

# Instead of having two columns Parch & SibSp,

# we can have only one column represent if the passenger had any family member aboard or not,

# Meaning, if having any family member(whether parent, brother, ...etc) will increase chances of Survival or not.

train_df['Family'] =  train_df["Parch"] + train_df["SibSp"]

train_df['Family'].loc[train_df['Family'] > 0] = 1

train_df['Family'].loc[train_df['Family'] == 0] = 0

test_df['Family'] =  test_df["Parch"] + test_df["SibSp"]

test_df['Family'].loc[test_df['Family'] > 0] = 1

test_df['Family'].loc[test_df['Family'] == 0] = 0

# drop Parch & SibSp

train_df = train_df.drop(['SibSp','Parch'], axis=1)

test_df    = test_df.drop(['SibSp','Parch'], axis=1)

# plot

fig, (axis1,axis2) = plt.subplots(1,2,sharex=True,figsize=(10,5))

# sns.factorplot('Family',data=train_df,kind='count',ax=axis1)

sns.countplot(x='Family', data=train_df, order=[1,0], ax=axis1)

# average of survived for those who had/didn't have any family member

family_perc = train_df[["Family", "Survived"]].groupby(['Family'],as_index=False).mean()

sns.barplot(x='Family', y='Survived', data=family_perc, order=[1,0], ax=axis2)

axis1.set_xticklabels(["With Family","Alone"], rotation=0)

# In[136]:

# Sex

# As we see, children(age < ~16) on aboard seem to have a high chances for Survival.

# So, we can classify passengers as males, females, and child

def get_person(passenger):

    age,sex = passenger

    return 'child' if age < 16 else sex

train_df['Person'] = train_df[['Age','Sex']].apply(get_person,axis=1)

test_df['Person']    = test_df[['Age','Sex']].apply(get_person,axis=1)

# No need to use Sex column since we created Person column

train_df.drop(['Sex'],axis=1,inplace=True)

test_df.drop(['Sex'],axis=1,inplace=True)

# create dummy variables for Person column, & drop Male as it has the lowest average of survived passengers

person_dummies_titanic  = pd.get_dummies(train_df['Person'])

person_dummies_titanic.columns = ['Child','Female','Male']

person_dummies_titanic.drop(['Male'], axis=1, inplace=True)

person_dummies_test  = pd.get_dummies(test_df['Person'])

person_dummies_test.columns = ['Child','Female','Male']

person_dummies_test.drop(['Male'], axis=1, inplace=True)

train_df = train_df.join(person_dummies_titanic)

test_df    = test_df.join(person_dummies_test)

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(10,5))

# sns.factorplot('Person',data=train_df,kind='count',ax=axis1)

sns.countplot(x='Person', data=train_df, ax=axis1)

# average of survived for each Person(male, female, or child)

person_perc = train_df[["Person", "Survived"]].groupby(['Person'],as_index=False).mean()

sns.barplot(x='Person', y='Survived', data=person_perc, ax=axis2, order=['male','female','child'])

train_df.drop(['Person'],axis=1,inplace=True)

test_df.drop(['Person'],axis=1,inplace=True)

# In[137]:

# Pclass

# sns.factorplot('Pclass',data=train_df,kind='count',order=[1,2,3])

sns.factorplot('Pclass','Survived',order=[1,2,3], data=train_df,size=5)

# create dummy variables for Pclass column, & drop 3rd class as it has the lowest average of survived passengers

pclass_dummies_titanic  = pd.get_dummies(train_df['Pclass'])

pclass_dummies_titanic.columns = ['Class_1','Class_2','Class_3']

pclass_dummies_titanic.drop(['Class_3'], axis=1, inplace=True)

pclass_dummies_test  = pd.get_dummies(test_df['Pclass'])

pclass_dummies_test.columns = ['Class_1','Class_2','Class_3']

pclass_dummies_test.drop(['Class_3'], axis=1, inplace=True)

train_df.drop(['Pclass'],axis=1,inplace=True)

test_df.drop(['Pclass'],axis=1,inplace=True)

train_df = train_df.join(pclass_dummies_titanic)

test_df    = test_df.join(pclass_dummies_test)

# In[139]:

# define training and testing sets

X_train = train_df.drop("Survived",axis=1)

Y_train = train_df["Survived"]

X_test  = test_df.drop("PassengerId",axis=1).copy()

# In[140]:

# Logistic Regression

logreg = LogisticRegression()

logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

logreg.score(X_train, Y_train)

# In[141]:

# Support Vector Machines

svc = SVC()

svc.fit(X_train, Y_train)

Y_pred = svc.predict(X_test)

svc.score(X_train, Y_train)

# In[142]:

# Random Forests

random_forest = RandomForestClassifier(n_estimators=100)

random_forest.fit(X_train, Y_train)

Y_pred = random_forest.predict(X_test)

random_forest.score(X_train, Y_train)

# In[143]:

# get Correlation Coefficient for each feature using Logistic Regression

coeff_df = DataFrame(train_df.columns.delete(0))

coeff_df.columns = ['Features']

coeff_df["Coefficient Estimate"] = pd.Series(logreg.coef_[0])

# preview

coeff_df

# In[ ]:

Classification and Prediction的更多相关文章

【转】Windows下使用libsvm中的grid.py和easy.py进行参数调优
libsvm中有进行参数调优的工具grid.py和easy.py可以使用,这些工具可以帮助我们选择更好的参数,减少自己参数选优带来的烦扰. 所需工具:libsvm.gnuplot 本机环境:Windo ...
A Complete Tutorial on Tree Based Modeling from Scratch (in R & Python)
A Complete Tutorial on Tree Based Modeling from Scratch (in R & Python) MACHINE LEARNING PYTHON ...
Applied Deep Learning Resources
Applied Deep Learning Resources A collection of research articles, blog posts, slides and code snipp ...
LIBSVM的使用方法
[原文:http://wenku.baidu.com/view/7e7b6b896529647d27285276.html] 目录 1 Libsvm下载... 3 2 Libsvm3.0环境变量设 ...
What is machine learning?
What is machine learning? One area of technology that is helping improve the services that we use on ...
高数量类别特征（high-cardinality categorical attributes）的预处理方法
high-cardinality categorical attributes,从字面上理解,即对于某个category特征,不同值的数量非常多,这里暂且把它叫做高数量类别属性.反之,即低数量类别属性 ...
机器学习基石8-Noise and Error
注: 文章中所有的图片均来自台湾大学林轩田<机器学习基石>课程. 笔记原作者:红色石头微信公众号:AI有道上一节课,我们主要介绍了VC Dimension的概念.如果Hypothese ...
Intel DAAL AI加速 ——传统决策树和随机森林
# file: dt_cls_dense_batch.py #===================================================================== ...
liblinear参数及使用方法（原创）
开发语言:JAVA 开发工具:eclipse (下载地址 http://www.eclipse.org/downloads/) liblinear版本:liblinear-1.94.jar (下载地址 ...

随机推荐

ACM学习历程—SNNUOJ 1239 Counting Star Time(树状数组 && 动态规划 && 数论)
http://219.244.176.199/JudgeOnline/problem.php?id=1239 这是这次陕西省赛的G题,题目大意是一个n*n的点阵,点坐标从(1, 1)到(n, n),每 ...
ASP.NET Cache缓存的使用
ASP.NET Cache是提升系统性能的重要方法,它使用了“最近使用”原则(a least-recently-used algorithm).在数据库访问中经常会用到Cache保存数据库数据. 1. ...
调用 SSPI 失败，请参见内部异常解决方法
2017-11-12 12:49:53:706] OnServerConnectionAvailable error : System.Security.Authentication.Authenti ...
遍历listmap 遍历map
package excel; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import j ...
用css来写一个背景图片的切换
代码如下: <!DOCTYPE HTML> <htmllang="en-US"> <head> <meta charset="U ...
MATLAB01
在命令行窗口输入edit就会进入代码编写区,编写完毕点击运行后会先进行保存,然后再执行代码,保存时候一定要以英文开头. 数组创建矩阵: 函数名描述 zero(m,n) 创建m行n列全零矩阵 one ...
Linux盘符漂移问题
Linux管理多块磁盘时(以sata盘为例),会按磁盘加载的顺序依次给磁盘命名为/dev/sda, /dev/sdb... 这种命名规则就会导致,一块磁盘在发生热插拔或系统重启后,盘符可能发生变化,会 ...
java流类，快速统计出字符次数+++
总结:学会给一个合适的命名,不要总是abc..虽然简单,但是不容易看懂,和方便去理解 package com.aini; import java.io.File; import java.io.Fil ...
1127 ZigZagging on a Tree
题意:中序序列+后序序列构建二叉树,之字形输出其层序序列. 思路:在结点的数据域中额外增加一个layer表示结点所在的层次,并定义vector<int> zigzag[maxn]存放最终结 ...
1096 Consecutive Factors
题意: 给出一个正整数N,找到最长的连续的可分解因子.如N=630,可被分解为630=3*5*6*7,其中5*6*7是3个连续的因子. 思路: 首先,需要明确,对于任何一个整数,如果它是素数,则不可被 ...

Classification and Prediction

Classification and Prediction的更多相关文章

随机推荐

热门专题