# coding: utf-8

# In[128]:

get_ipython().magic(u'matplotlib inline')
import pandas as pd
from pandas import Series,DataFrame
import seaborn as sns
sns.set_style('whitegrid')
pd.set_option('display.mpl_style', 'default')
import numpy as np
import matplotlib.pyplot as plt from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB train_df= pd.read_csv("/home/lpstudy/下载/train.csv")
test_df = pd.read_csv("/home/lpstudy/下载/test.csv") train_df.head() test_df.head() # In[129]: train_df = train_df.drop(["Ticket","PassengerId","Name"],axis = 1)
test_df = test_df.drop(["Name","Ticket"],axis =1) # In[130]: train_df.head() # In[131]: train_df["Embarked"] = train_df["Embarked"].fillna("S")
#plot
sns.factorplot("Embarked","Survived",data = train_df,size = 6,aspect = 2) fig,(axis1,axis2,axis3) = plt.subplots(1,3,figsize = (15,5)) sns.countplot(x='Embarked', data=train_df, ax=axis1)
sns.countplot(x='Survived', hue="Embarked", data=train_df, order=[1,0], ax=axis2) embark_perc = train_df[["Embarked", "Survived"]].groupby(['Embarked'],as_index=False).mean()
sns.barplot(x='Embarked', y='Survived', data=embark_perc,order=['S','C','Q'],ax=axis3) embark_dummies_train = pd.get_dummies(train_df['Embarked'])
embark_dummies_train.drop(['S'], axis=1, inplace=True) embark_dummies_test = pd.get_dummies(test_df['Embarked'])
embark_dummies_test.drop(['S'], axis=1, inplace=True) train_df = train_df.join(embark_dummies_train)
test_df = test_df.join(embark_dummies_test) train_df.drop(['Embarked'], axis=1,inplace=True)
test_df.drop(['Embarked'], axis=1,inplace=True) # In[132]: test_df["Fare"].fillna(test_df["Fare"].median(), inplace=True) train_df['Fare'] = train_df['Fare'].astype(int)
test_df['Fare'] = test_df['Fare'].astype(int) fare_not_survived = train_df["Fare"][train_df["Survived"] == 0]
fare_survived = train_df["Fare"][train_df["Survived"] == 1] avgerage_fare = DataFrame([fare_not_survived.mean(), fare_survived.mean()])
std_fare = DataFrame([fare_not_survived.std(), fare_survived.std()]) #plot
train_df['Fare'].plot(kind='hist', figsize=(15,3),bins=100, xlim=(0,50)) avgerage_fare.index.names = std_fare.index.names = ["Survived"]
avgerage_fare.plot(yerr=std_fare,kind='bar',legend=False) # In[133]: # Age fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
axis1.set_title('Original Age values - Titanic')
axis2.set_title('New Age values - Titanic') average_age_titanic = train_df["Age"].mean()
std_age_titanic = train_df["Age"].std()
count_nan_age_titanic = train_df["Age"].isnull().sum() # get average, std, and number of NaN values in test_df
average_age_test = test_df["Age"].mean()
std_age_test = test_df["Age"].std()
count_nan_age_test = test_df["Age"].isnull().sum() # generate random numbers between (mean - std) & (mean + std)
rand_1 = np.random.randint(average_age_titanic - std_age_titanic, average_age_titanic + std_age_titanic, size = count_nan_age_titanic)
rand_2 = np.random.randint(average_age_test - std_age_test, average_age_test + std_age_test, size = count_nan_age_test) # plot original Age values
# NOTE: drop all null values, and convert to int
train_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)
# test_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1) # fill NaN values in Age column with random values generated
train_df["Age"][np.isnan(train_df["Age"])] = rand_1
test_df["Age"][np.isnan(test_df["Age"])] = rand_2 # convert from float to int
train_df['Age'] = train_df['Age'].astype(int)
test_df['Age'] = test_df['Age'].astype(int) # plot new Age Values
train_df['Age'].hist(bins=70, ax=axis2)
# test_df['Age'].hist(bins=70, ax=axis4) # In[134]: # .... continue with plot Age column # peaks for survived/not survived passengers by their age
facet = sns.FacetGrid(train_df, hue="Survived",aspect=4)
facet.map(sns.kdeplot,'Age',shade= True)
facet.set(xlim=(0, train_df['Age'].max()))
facet.add_legend() # average survived passengers by age
fig, axis1 = plt.subplots(1,1,figsize=(18,4))
average_age = train_df[["Age", "Survived"]].groupby(['Age'],as_index=False).mean()
sns.barplot(x='Age', y='Survived', data=average_age) # In[135]: # Cabin
# It has a lot of NaN values, so it won't cause a remarkable impact on prediction
train_df.drop("Cabin",axis=1,inplace=True)
test_df.drop("Cabin",axis=1,inplace=True) # Family # Instead of having two columns Parch & SibSp,
# we can have only one column represent if the passenger had any family member aboard or not,
# Meaning, if having any family member(whether parent, brother, ...etc) will increase chances of Survival or not.
train_df['Family'] = train_df["Parch"] + train_df["SibSp"]
train_df['Family'].loc[train_df['Family'] > 0] = 1
train_df['Family'].loc[train_df['Family'] == 0] = 0 test_df['Family'] = test_df["Parch"] + test_df["SibSp"]
test_df['Family'].loc[test_df['Family'] > 0] = 1
test_df['Family'].loc[test_df['Family'] == 0] = 0 # drop Parch & SibSp
train_df = train_df.drop(['SibSp','Parch'], axis=1)
test_df = test_df.drop(['SibSp','Parch'], axis=1) # plot
fig, (axis1,axis2) = plt.subplots(1,2,sharex=True,figsize=(10,5)) # sns.factorplot('Family',data=train_df,kind='count',ax=axis1)
sns.countplot(x='Family', data=train_df, order=[1,0], ax=axis1) # average of survived for those who had/didn't have any family member
family_perc = train_df[["Family", "Survived"]].groupby(['Family'],as_index=False).mean()
sns.barplot(x='Family', y='Survived', data=family_perc, order=[1,0], ax=axis2) axis1.set_xticklabels(["With Family","Alone"], rotation=0) # In[136]: # Sex # As we see, children(age < ~16) on aboard seem to have a high chances for Survival.
# So, we can classify passengers as males, females, and child
def get_person(passenger):
age,sex = passenger
return 'child' if age < 16 else sex train_df['Person'] = train_df[['Age','Sex']].apply(get_person,axis=1)
test_df['Person'] = test_df[['Age','Sex']].apply(get_person,axis=1) # No need to use Sex column since we created Person column
train_df.drop(['Sex'],axis=1,inplace=True)
test_df.drop(['Sex'],axis=1,inplace=True) # create dummy variables for Person column, & drop Male as it has the lowest average of survived passengers
person_dummies_titanic = pd.get_dummies(train_df['Person'])
person_dummies_titanic.columns = ['Child','Female','Male']
person_dummies_titanic.drop(['Male'], axis=1, inplace=True) person_dummies_test = pd.get_dummies(test_df['Person'])
person_dummies_test.columns = ['Child','Female','Male']
person_dummies_test.drop(['Male'], axis=1, inplace=True) train_df = train_df.join(person_dummies_titanic)
test_df = test_df.join(person_dummies_test) fig, (axis1,axis2) = plt.subplots(1,2,figsize=(10,5)) # sns.factorplot('Person',data=train_df,kind='count',ax=axis1)
sns.countplot(x='Person', data=train_df, ax=axis1) # average of survived for each Person(male, female, or child)
person_perc = train_df[["Person", "Survived"]].groupby(['Person'],as_index=False).mean()
sns.barplot(x='Person', y='Survived', data=person_perc, ax=axis2, order=['male','female','child']) train_df.drop(['Person'],axis=1,inplace=True)
test_df.drop(['Person'],axis=1,inplace=True) # In[137]: # Pclass # sns.factorplot('Pclass',data=train_df,kind='count',order=[1,2,3])
sns.factorplot('Pclass','Survived',order=[1,2,3], data=train_df,size=5) # create dummy variables for Pclass column, & drop 3rd class as it has the lowest average of survived passengers
pclass_dummies_titanic = pd.get_dummies(train_df['Pclass'])
pclass_dummies_titanic.columns = ['Class_1','Class_2','Class_3']
pclass_dummies_titanic.drop(['Class_3'], axis=1, inplace=True) pclass_dummies_test = pd.get_dummies(test_df['Pclass'])
pclass_dummies_test.columns = ['Class_1','Class_2','Class_3']
pclass_dummies_test.drop(['Class_3'], axis=1, inplace=True) train_df.drop(['Pclass'],axis=1,inplace=True)
test_df.drop(['Pclass'],axis=1,inplace=True) train_df = train_df.join(pclass_dummies_titanic)
test_df = test_df.join(pclass_dummies_test) # In[139]: # define training and testing sets X_train = train_df.drop("Survived",axis=1)
Y_train = train_df["Survived"]
X_test = test_df.drop("PassengerId",axis=1).copy() # In[140]: # Logistic Regression logreg = LogisticRegression() logreg.fit(X_train, Y_train) Y_pred = logreg.predict(X_test) logreg.score(X_train, Y_train) # In[141]: # Support Vector Machines svc = SVC() svc.fit(X_train, Y_train) Y_pred = svc.predict(X_test) svc.score(X_train, Y_train) # In[142]: # Random Forests random_forest = RandomForestClassifier(n_estimators=100) random_forest.fit(X_train, Y_train) Y_pred = random_forest.predict(X_test) random_forest.score(X_train, Y_train) # In[143]: # get Correlation Coefficient for each feature using Logistic Regression
coeff_df = DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Features']
coeff_df["Coefficient Estimate"] = pd.Series(logreg.coef_[0]) # preview
coeff_df # In[ ]:

Classification and Prediction的更多相关文章

  1. 【转】Windows下使用libsvm中的grid.py和easy.py进行参数调优

    libsvm中有进行参数调优的工具grid.py和easy.py可以使用,这些工具可以帮助我们选择更好的参数,减少自己参数选优带来的烦扰. 所需工具:libsvm.gnuplot 本机环境:Windo ...

  2. A Complete Tutorial on Tree Based Modeling from Scratch (in R & Python)

    A Complete Tutorial on Tree Based Modeling from Scratch (in R & Python) MACHINE LEARNING PYTHON  ...

  3. Applied Deep Learning Resources

    Applied Deep Learning Resources A collection of research articles, blog posts, slides and code snipp ...

  4. LIBSVM的使用方法

    [原文:http://wenku.baidu.com/view/7e7b6b896529647d27285276.html] 目  录 1 Libsvm下载... 3 2 Libsvm3.0环境变量设 ...

  5. What is machine learning?

    What is machine learning? One area of technology that is helping improve the services that we use on ...

  6. 高数量类别特征(high-cardinality categorical attributes)的预处理方法

    high-cardinality categorical attributes,从字面上理解,即对于某个category特征,不同值的数量非常多,这里暂且把它叫做高数量类别属性.反之,即低数量类别属性 ...

  7. 机器学习基石8-Noise and Error

    注: 文章中所有的图片均来自台湾大学林轩田<机器学习基石>课程. 笔记原作者:红色石头 微信公众号:AI有道 上一节课,我们主要介绍了VC Dimension的概念.如果Hypothese ...

  8. Intel DAAL AI加速 ——传统决策树和随机森林

    # file: dt_cls_dense_batch.py #===================================================================== ...

  9. liblinear参数及使用方法(原创)

    开发语言:JAVA 开发工具:eclipse (下载地址 http://www.eclipse.org/downloads/) liblinear版本:liblinear-1.94.jar (下载地址 ...

随机推荐

  1. RESTful 服务示例

    WCF服务轻量级服务,可供JS调用 返回值格式:XML.Json 工程结构: 示例代码: using System; using System.Collections.Generic; using S ...

  2. LeetCode Partition to K Equal Sum Subsets

    原题链接在这里:https://leetcode.com/problems/partition-to-k-equal-sum-subsets/description/ 题目: Given an arr ...

  3. 如何在本地浏览器访问nginx

    1.打开vmware"编辑虚拟机"设置,点击“网络适配器”选择“桥联模式”: 2.开启该虚拟机,输入用户名root及密码登陆服务器: 3.以管理员身份打开cmd,在命令窗口输入ip ...

  4. YY一下十年后的自己(转)

    每到年底总是我最焦虑的时候,年龄越大情况越明显.可能越长大越是对 时光的流逝 更有感触,有感触之后就会胡思乱想.所以随手开始写下这篇文章. 人无远虑必有近忧.那么同学呀,你听说过安利么. 一直都有做总 ...

  5. winSCP连接FTP没有上传的权限

    错误: 原因: ftp用户为 1)查看ubantu中FTP文件夹目录所有者及权限,发现ftpName用户对FTP文件夹的权限为 “r-x”  ,仅有读,执行权限 2) chmod o=rwx ftp ...

  6. ①HttpURLConnection通过报文提交

    在进行短信发送的接口,因厂家不同,有的厂家会采用报文的格式进行短信请求的发送与接收.本文主要介绍利用HttpURLConnection进行短信报文的请求与响应. 一般的url请求分为两种,一种是GET ...

  7. Oracle GoldenGate理论

    1Oracle GoldenGate处理方法和支持的数据库Oracle GoldenGate在多样和异构的基础IT平台中,可以在事务级别上进行数据交换和数据操作.在保证交易完整性和最小的开销的条件下, ...

  8. Oracle配置文件tnsnames.ora新增链接后连接报错:ORA-12154: TNS:无法解析指定的标识符

    一个空格引发的血案:在tnsnames. ora文件中新加了一个配置,该配置估计当时是拷的别人的直接粘贴上去的,然后发现用pl/sql连接就一直报错了,后面排除了用户名和密码问题和后,仔细看了该文件才 ...

  9. erlang热部署

    以下流程参考rebar的wiki,亲测 rebar的版本一定要注意,高版本对于下面两个指令有bug rebar generate-appups rebar generate-upgrade 经过一个个 ...

  10. java中输入3个数,从大到小的输出。。。。

    总结:我暂时不能理解,C语言时讲过,java里就不理解了 package com.a; import java.sql.Date; import java.util.Scanner; //输入三个数, ...