吴裕雄--天生自然 python数据分析:医疗费数据分析

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as pl
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv('F:\\kaggleDataSet\\MedicalCostPersonal\\insurance.csv')
data.head()

data.isnull().sum()

from sklearn.preprocessing import LabelEncoder
#sex
le = LabelEncoder()
le.fit(data.sex.drop_duplicates())
data.sex = le.transform(data.sex)
# smoker or not
le.fit(data.smoker.drop_duplicates())
data.smoker = le.transform(data.smoker)
#region
le.fit(data.region.drop_duplicates())
data.region = le.transform(data.region)
data.corr()['charges'].sort_values()

f, ax = pl.subplots(figsize=(10, 8))
corr = data.corr()
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=np.bool), cmap=sns.diverging_palette(240,10,as_cmap=True),square=True, ax=ax)

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
output_notebook()
import scipy.special
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_file
p = figure(title="Distribution of charges",tools="save",background_fill_color="#E8DDCB")
hist, edges = np.histogram(data.charges)
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],fill_color="#036564", line_color="#033649")
p.xaxis.axis_label = 'x'
p.yaxis.axis_label = 'Pr(x)'
show(gridplot(p,ncols = 2, plot_width=400, plot_height=400, toolbar_location=None))

f= pl.figure(figsize=(12,5)) ax=f.add_subplot(121)
sns.distplot(data[(data.smoker == 1)]["charges"],color='c',ax=ax)
ax.set_title('Distribution of charges for smokers') ax=f.add_subplot(122)
sns.distplot(data[(data.smoker == 0)]['charges'],color='b',ax=ax)
ax.set_title('Distribution of charges for non-smokers')

sns.catplot(x="smoker", kind="count",hue = 'sex', palette="pink", data=data)

sns.catplot(x="sex", y="charges", hue="smoker",kind="violin", data=data, palette = 'magma')

pl.figure(figsize=(12,5))
pl.title("Box plot for charges of women")
sns.boxplot(y="smoker", x="charges", data = data[(data.sex == 1)] , orient="h", palette = 'magma')

pl.figure(figsize=(12,5))
pl.title("Box plot for charges of men")
sns.boxplot(y="smoker", x="charges", data = data[(data.sex == 0)] , orient="h", palette = 'rainbow')

pl.figure(figsize=(12,5))
pl.title("Distribution of age")
ax = sns.distplot(data["age"], color = 'g')

sns.catplot(x="smoker", kind="count",hue = 'sex', palette="rainbow", data=data[(data.age == 18)])
pl.title("The number of smokers and non-smokers (18 years old)")


g = sns.jointplot(x="age", y="charges", data = data[(data.smoker == 0)],kind="kde", color="m")
g.plot_joint(pl.scatter, c="w", s=30, linewidth=1, marker="+")
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$X$", "$Y$")
ax.set_title('Distribution of charges and age for non-smokers')

g = sns.jointplot(x="age", y="charges", data = data[(data.smoker == 1)],kind="kde", color="c")
g.plot_joint(pl.scatter, c="w", s=30, linewidth=1, marker="+")
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$X$", "$Y$")
ax.set_title('Distribution of charges and age for smokers')

#non - smokers
p = figure(plot_width=500, plot_height=450)
p.circle(x=data[(data.smoker == 0)].age,y=data[(data.smoker == 0)].charges, size=7, line_color="navy", fill_color="pink", fill_alpha=0.9) show(p)


#smokers
p = figure(plot_width=500, plot_height=450)
p.circle(x=data[(data.smoker == 1)].age,y=data[(data.smoker == 1)].charges, size=7, line_color="navy", fill_color="red", fill_alpha=0.9)
show(p)


sns.lmplot(x="age", y="charges", hue="smoker", data=data, palette = 'inferno_r', size = 7)
ax.set_title('Smokers and non-smokers')

pl.figure(figsize=(12,5))
pl.title("Distribution of bmi")
ax = sns.distplot(data["bmi"], color = 'm')

pl.figure(figsize=(12,5))
pl.title("Distribution of charges for patients with BMI greater than 30")
ax = sns.distplot(data[(data.bmi >= 30)]['charges'], color = 'm')

pl.figure(figsize=(12,5))
pl.title("Distribution of charges for patients with BMI less than 30")
ax = sns.distplot(data[(data.bmi < 30)]['charges'], color = 'b')

g = sns.jointplot(x="bmi", y="charges", data = data,kind="kde", color="r")
g.plot_joint(pl.scatter, c="w", s=30, linewidth=1, marker="+")
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$X$", "$Y$")
ax.set_title('Distribution of bmi and charges')

pl.figure(figsize=(10,6))
ax = sns.scatterplot(x='bmi',y='charges',data=data,palette='magma',hue='smoker')
ax.set_title('Scatter plot of charges and bmi') sns.lmplot(x="bmi", y="charges", hue="smoker", data=data, palette = 'magma', size = 8)


sns.catplot(x="children", kind="count", palette="ch:.25", data=data, size = 6)

sns.catplot(x="smoker", kind="count", palette="rainbow",hue = "sex",
data=data[(data.children > 0)], size = 6)
ax.set_title('Smokers and non-smokers who have childrens')

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.ensemble import RandomForestRegressor
x = data.drop(['charges'], axis = 1)
y = data.charges x_train,x_test,y_train,y_test = train_test_split(x,y, random_state = 0)
lr = LinearRegression().fit(x_train,y_train) y_train_pred = lr.predict(x_train)
y_test_pred = lr.predict(x_test) print(lr.score(x_test,y_test))

X = data.drop(['charges','region'], axis = 1)
Y = data.charges quad = PolynomialFeatures (degree = 2)
x_quad = quad.fit_transform(X) X_train,X_test,Y_train,Y_test = train_test_split(x_quad,Y, random_state = 0) plr = LinearRegression().fit(X_train,Y_train) Y_train_pred = plr.predict(X_train)
Y_test_pred = plr.predict(X_test) print(plr.score(X_test,Y_test))

forest = RandomForestRegressor(n_estimators = 100,criterion = 'mse',random_state = 1,n_jobs = -1)
forest.fit(x_train,y_train)
forest_train_pred = forest.predict(x_train)
forest_test_pred = forest.predict(x_test) print('MSE train data: %.3f, MSE test data: %.3f' % (
mean_squared_error(y_train,forest_train_pred),
mean_squared_error(y_test,forest_test_pred)))
print('R2 train data: %.3f, R2 test data: %.3f' % (
r2_score(y_train,forest_train_pred),
r2_score(y_test,forest_test_pred)))

pl.figure(figsize=(10,6)) pl.scatter(forest_train_pred,forest_train_pred - y_train,c = 'black', marker = 'o', s = 35, alpha = 0.5,label = 'Train data')
pl.scatter(forest_test_pred,forest_test_pred - y_test,c = 'c', marker = 'o', s = 35, alpha = 0.7,label = 'Test data')
pl.xlabel('Predicted values')
pl.ylabel('Tailings')
pl.legend(loc = 'upper left')
pl.hlines(y = 0, xmin = 0, xmax = 60000, lw = 2, color = 'red')
pl.show()

吴裕雄--天生自然 python数据分析:医疗费数据分析的更多相关文章
- 吴裕雄--天生自然 PYTHON数据分析:糖尿病视网膜病变数据分析(完整版)
# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by ...
- 吴裕雄--天生自然 PYTHON数据分析:所有美国股票和etf的历史日价格和成交量分析
# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by ...
- 吴裕雄--天生自然 python数据分析:健康指标聚集分析(健康分析)
# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by ...
- 吴裕雄--天生自然 python数据分析:葡萄酒分析
# import pandas import pandas as pd # creating a DataFrame pd.DataFrame({'Yes': [50, 31], 'No': [101 ...
- 吴裕雄--天生自然 PYTHON数据分析:人类发展报告——HDI, GDI,健康,全球人口数据数据分析
import pandas as pd # Data analysis import numpy as np #Data analysis import seaborn as sns # Data v ...
- 吴裕雄--天生自然 PYTHON语言数据分析:ESA的火星快车操作数据集分析
import os import numpy as np import pandas as pd from datetime import datetime import matplotlib imp ...
- 吴裕雄--天生自然 python语言数据分析:开普勒系外行星搜索结果分析
import pandas as pd pd.DataFrame({'Yes': [50, 21], 'No': [131, 2]}) pd.DataFrame({'Bob': ['I liked i ...
- 吴裕雄--天生自然 PYTHON数据分析:基于Keras的CNN分析太空深处寻找系外行星数据
#We import libraries for linear algebra, graphs, and evaluation of results import numpy as np import ...
- 吴裕雄--天生自然 python数据分析:基于Keras使用CNN神经网络处理手写数据集
import pandas as pd import numpy as np import matplotlib.pyplot as plt import matplotlib.image as mp ...
随机推荐
- mybatis缓存,从一个“灵异”事件说起
刚准备下班走人,被一开发同事叫住,让帮看一个比较奇怪的问题:Mybatis同一个Mapper接口的查询方法,第一次返回与第二次返回结果不一样,百思不得其解! 问题 Talk is cheap. Sho ...
- java数据域初始化
1.在声明中赋值 /** * Created by N3verL4nd on 2016/11/19. */ class Test{ private String str = "Hello W ...
- WTL对话框添加背景图片
WTL91_5321_Final + VS2013 + WIN7 // MainDlg.h : interface of the CMainDlg class // ///////////////// ...
- 全卷积网络FCN
全卷积网络FCN fcn是深度学习用于图像分割的鼻祖.后续的很多网络结构都是在此基础上演进而来. 图像分割即像素级别的分类. 语义分割的基本框架: 前端fcn(以及在此基础上的segnet,decon ...
- HDU_2579_bfs
http://acm.split.hdu.edu.cn/showproblem.php?pid=2579 简单bfs题,刚开始在纠结怎么存放vis,因为步数可能有几百步,这么多格子开数组的话也太多了, ...
- BZOJ 4556(后缀数组+主席树求前驱后继+二分||后缀数组+二分+可持久化线段树)
换markdown写了.. 题意: 给你一个1e5的字符串,1e5组询问,求\([l_1,r_1]\)的所有子串与\([l_2,r_2]\)的lcp 思路: 首先可以发现答案是具有单调性的,我们考虑二 ...
- 数据结构与算法的实现(c++)之第一天
开发工具:codeblocks 17.12版本 学习视频来自b站 第一天:学习swap交换.冒泡排序 swap交换:swap是几乎所有的排序的最基础部分,代码如下: #include <iost ...
- 第2章 Java并行程序基础(二)
2.3 volatile 与 Java 内存模型(JMM) volatile对于保证操作的原子性是由非常大的帮助的(可见性).但是需要注意的是,volatile并不能代替锁,它也无法保证一些复合操作的 ...
- Matplotlib数据可视化从入门到精通(持续更新)
目录 前言 如何添加标题-title 如何添加文字-text 如何添加注释-annotate 如何设置坐标轴名称-xlabel/ylabel 如何添加图例-legend 如何调整颜色-color 如何 ...
- js垃圾回收与内存泄漏
js垃圾回收机制 概念: javascript具有自动垃圾收集机制,也就是说,执行环境会负责管理代码执行过程中的使用的内存.而在C和C++之类的语言中,开发人员的一项基本任务就是手动跟踪内存的使用情况 ...