我的代码-statistic analysis

# coding: utf-8

# In[1]:

# numpy and pandas for data manipulation
import numpy as np
import pandas as pd

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# In[3]:

# Load the data set from a hard-coded local CSV path and report its (rows, columns) shape.
app_test = pd.read_csv(r'D:\Users\sgg91044\Desktop\MEP_no_defect_data_pivot.csv')
print('Testing data shape: ', app_test.shape)
# Preview of up to the first 20000 rows (only rendered when run as a notebook cell).
app_test.head(20000)

# In[4]:

# Distribution of the 'Target' column: counts per value, then a histogram.
# NOTE(review): value_counts() is not the last expression of the cell, so its
# result is discarded; wrap it in print() if the counts should be shown.
app_test['Target'].value_counts()
app_test['Target'].astype(int).plot.hist();

# In[5]:

def missing_values_table(app_test):
    """Summarise the missing values of each column of a dataframe.

    Prints how many columns the frame has and how many of them contain
    missing values.

    Args:
        app_test (pandas.DataFrame): frame to inspect.

    Returns:
        pandas.DataFrame: one row per column that has at least one missing
        value, with the columns 'Missing Values' (count) and
        '% of Total Values' (share of rows in percent, rounded to one
        decimal), sorted by percentage in descending order.
    """
    # Count the missing entries once and derive the percentage from the count
    mis_val = app_test.isnull().sum()
    mis_val_percent = 100 * mis_val / len(app_test)

    # Make a table with the results
    mis_val_table_ren_columns = pd.DataFrame({
        'Missing Values': mis_val,
        '% of Total Values': mis_val_percent,
    })

    # Keep only the columns that have missing entries. Filtering on the count
    # (not on the percentage) stops a frame with zero rows from reporting
    # every column through 0 / 0 = NaN.
    has_missing = mis_val_table_ren_columns['Missing Values'] > 0
    mis_val_table_ren_columns = (
        mis_val_table_ren_columns[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has " + str(app_test.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

# In[6]:

# Missing values statistics: build the per-column summary for app_test and
# look at its first 20 rows (the summary is sorted by percentage, descending).
missing_values = missing_values_table(app_test)
missing_values.head(20)

# In[7]:

# Columns whose missing values are replaced by the column median.
# 'HELK_MIN.' is intentionally not listed: it was already disabled here.
MEDIAN_FILL_COLUMNS = [
    'RR13_MAX.',
    'ETCM_PHC4',
    'HELK_MEAN',
    'PBK4',
    'ETCM_PHB4',
    'ETCM_PHA4',
    'THR3_MAX.',
    'THR3_MEAN',
    'RR23_MEAN',
    'RR13_MEAN',
    'THR3_MEAN_DIFF',
    'THR3_MEAN_SLOPE',
    'THR3_MAX._DIFF',
    'LOWERCHM_PRESS',
    'HELK_MAX.',
    'HELK_SD',
    'THR3_SD',
    'RR23_MAX.',
]

# One median per listed column, keyed by column name so it can be handed
# straight to fillna(); each name appears exactly once.
column_medians = {column: app_test[column].median() for column in MEDIAN_FILL_COLUMNS}

# In[8]:

# Fill the gaps of each listed column with that column's median; columns that
# are not keys of the mapping are left untouched.
app_test = app_test.fillna(column_medians)
app_test

# In[9]:

# Correlation of each column with 'Target', taken from the pairwise correlation
# matrix and ordered ascending (most negative first, most positive last)
correlations = app_test.corr().loc[:, 'Target'].sort_values()

# The tail of the ascending series holds the largest values, the head the smallest
print('Most Positive Correlations:\n', correlations.tail(20))
print('\nMost Negative Correlations:\n', correlations.head(20))

# In[10]:

# Compare the distribution of 'ETCM_PHC4' between the two target classes
plt.figure(figsize=(20, 8))

# One KDE curve per class: 0 = non-defected wafers, 1 = defected wafers
for target_value in (0, 1):
    sns.kdeplot(app_test.loc[app_test['Target'] == target_value, 'ETCM_PHC4'],
                label='target == %d' % target_value)

# Labeling of plot
plt.xlabel('ETCM_PHC4')
plt.ylabel('Density')
plt.title('Distribution of ETCM_PHC4')

# In[11]:

# Box plot of 'HELK_SD' against the target label.
# NOTE(review): x is the continuous measurement and y the label; if one box per
# target class was intended, the two axes probably need to be swapped - confirm.
sns.boxplot(x='HELK_SD', y='Target', data=app_test)

# seaborn exposes no `plot` attribute; figures are displayed through matplotlib
plt.show()

# In[ ]:

# Scatter of 'PBK4' (x) against the target label (y), half-transparent red markers
app_test.plot.scatter(x='PBK4', y='Target', alpha=0.5, color='red')
# Axis labels and title are set on the axes the scatter call just created
plt.xlabel('PBK4')
plt.ylabel('Target')
plt.title('PBK4 Target Scatter Plot')

# In[ ]:

# Compare the distribution of 'HELK_MEAN' between the two target classes
plt.figure(figsize=(20, 8))

# One KDE curve per class: 0 = non-defected wafers, 1 = defected wafers
for target_value in (0, 1):
    sns.kdeplot(app_test.loc[app_test['Target'] == target_value, 'HELK_MEAN'],
                label='target == %d' % target_value)

# Labeling of plot
plt.xlabel('HELK_MEAN')
plt.ylabel('Density')
plt.title('Distribution of HELK_MEAN')

# In[ ]:

# Features whose per-class distributions are drawn, one subplot each
kde_features = ['LOWERCHM_PRESS',
                'RR13_MEAN',
                'RR13_MAX.',
                'RR23_MEAN',
                'THR3_MAX.',
                'THR3_MEAN',
                'RR23_MAX.',
                'PBK4',
                'THR3_MEAN_DIFF',
                'HELK_MEAN',
                'THR3_MEAN_SLOPE',
                'THR3_MAX._DIFF']

plt.figure(figsize = (15, 50))
# iterate through the features
for i, feature in enumerate(kde_features):

    # One row per feature; the row count follows the list so no empty slot is
    # left at the bottom of the figure
    plt.subplot(len(kde_features), 1, i + 1)
    # plot non_defected wafer
    sns.kdeplot(app_test.loc[app_test['Target'] == 0, feature], label = 'Target == 0')
    # plot defected wafer
    sns.kdeplot(app_test.loc[app_test['Target'] == 1, feature], label = 'Target == 1')

    # Label the plots
    plt.title('Distribution of %s by Target Value' % feature)
    plt.xlabel('%s' % feature); plt.ylabel('Density');

# Applied once, after all subplots exist
plt.tight_layout(h_pad = 0.1)

# In[ ]:

# Bin the parameter 'ETCM_PHC4' data.
# Work on an explicit copy so that adding the bin column does not write into a
# slice of app_test (chained-assignment hazard).
ETCM_PHC4 = app_test[['Target', 'ETCM_PHC4']].copy()
# 17 edges from 200 to 1000 give 16 bins, each 50 wide
ETCM_PHC4['VALUE_BINNED'] = pd.cut(ETCM_PHC4['ETCM_PHC4'], bins = np.linspace(200, 1000, num = 17))
ETCM_PHC4.head(20)

# In[ ]:

# Group by the bin and calculate averages
ETCM_PHC4_groups = ETCM_PHC4.groupby('VALUE_BINNED').mean()
ETCM_PHC4_groups

# In[ ]:

plt.figure(figsize = (8, 8))

# Bar plot of the mean target per ETCM_PHC4 bin, expressed as a percentage
plt.bar(ETCM_PHC4_groups.index.astype(str), 100 * ETCM_PHC4_groups['Target'])

# Plot labeling
plt.xticks(rotation = 75); plt.xlabel('ETCM_PHC4 value groups'); plt.ylabel('Possibility to be defected (%)')
plt.title('Possibility to be defected by ETCM_PHC4');

# In[ ]:

# Bin the parameter 'HELK_MEAN' data.
# Work on an explicit copy so that adding the bin column does not write into a
# slice of app_test (chained-assignment hazard).
HELK_MEAN = app_test[['Target', 'HELK_MEAN']].copy()
# 8 edges from 0 to 17.5 give 7 bins, each 2.5 wide
HELK_MEAN['VALUE_BINNED'] = pd.cut(HELK_MEAN['HELK_MEAN'], bins = np.linspace(0, 17.5, num = 8))
plt.figure(figsize = (8, 8))

# Group by the bin and calculate averages
HELK_MEAN_groups = HELK_MEAN.groupby('VALUE_BINNED').mean()
# Bar plot of the mean target per HELK_MEAN bin, expressed as a percentage
plt.bar(HELK_MEAN_groups.index.astype(str), 100 * HELK_MEAN_groups['Target'])

# Plot labeling
plt.xticks(rotation = 75); plt.xlabel('HELK_MEAN value groups'); plt.ylabel('Possibility to be defected (%)')
plt.title('Possibility to be defected by HELK_MEAN');

# In[13]:

# Take the target together with nine selected parameters and compute their
# pairwise correlation matrix
ext_data = app_test.loc[:, [
    'Target',
    'RR13_MAX.',
    'ETCM_PHC4',
    'HELK_MEAN',
    'PBK4',
    'ETCM_PHB4',
    'ETCM_PHA4',
    'HELK_MAX.',
    'HELK_SD',
    'THR3_SD',
]]
ext_data_corrs = ext_data.corr()
ext_data_corrs

# In[14]:

plt.figure(figsize=(10, 10))

# Annotated heatmap of the correlation matrix; the colour scale is pinned to
# the interval [0, 1]
sns.heatmap(
    ext_data_corrs,
    annot=True,
    vmin=0,
    vmax=1,
    cmap=plt.cm.RdYlBu_r,
)
plt.title('Correlation Heatmap')

# In[ ]:

plt.figure(figsize=(10, 12))

# One stacked subplot per ETCM parameter, each with a shaded KDE per class
for position, source in enumerate(['ETCM_PHA4', 'ETCM_PHB4', 'ETCM_PHC4'], start=1):
    plt.subplot(3, 1, position)

    # Class 0 = non-defected, class 1 = defected
    for target_value in (0, 1):
        sns.kdeplot(app_test.loc[app_test['Target'] == target_value, source],
                    label='target == %d' % target_value, shade=True)

    # Label the plots
    plt.title('Distribution of %s by Target Value' % source)
    plt.xlabel('%s' % source)
    plt.ylabel('Density')

# Vertical padding between the three subplots
plt.tight_layout(h_pad=2.5)

# In[ ]:

# Independent copy of ext_data for plotting, restricted to the target and the
# columns that remain once 'RR13_MAX.', 'HELK_MAX.', 'HELK_SD' and 'THR3_SD'
# are left out
plot_data = ext_data[['Target',
                      'ETCM_PHC4',
                      'HELK_MEAN',
                      'PBK4',
                      'ETCM_PHB4',
                      'ETCM_PHA4']].copy()

def corr_func(x, y, **kwargs):
    """Annotate the current axes with the correlation coefficient of x and y.

    Extra keyword arguments are accepted and ignored. The text "r = <value>"
    (two decimals) is placed at axes-fraction position (0.2, 0.8).
    """
    coefficient = np.corrcoef(x, y)[0, 1]
    axes = plt.gca()
    text = "r = {:.2f}".format(coefficient)
    axes.annotate(text, xy=(.2, .8), xycoords=axes.transAxes, size=20)

# Create the pairgrid object over every column of plot_data except 'Target',
# which is used for colouring.
# `height` is the per-facet size; it replaces the `size` keyword that seaborn
# deprecated in 0.9 and later removed.
grid = sns.PairGrid(data = plot_data, height = 3, diag_sharey=False,
                    hue = 'Target',
                    vars = [x for x in list(plot_data.columns) if x != 'Target'])

# Upper triangle: scatter plots
grid.map_upper(plt.scatter, alpha = 0.2)

# Diagonal: kernel density estimate of each single variable
grid.map_diag(sns.kdeplot)

# Lower triangle: kernel density estimate of each variable pair
grid.map_lower(sns.kdeplot, cmap = plt.cm.OrRd_r);

plt.suptitle('Ext Source Features Pairs Plot', size = 32, y = 1.05);

# In[12]:

# seaborn is already imported as `sns` at the top of the file; this re-import is a no-op
import seaborn as sns
# Create the default pairplot over all of app_test (no column selection, no hue)
sns.pairplot(app_test)

# In[224]:

# Persist the imputed frame to a hard-coded local path. With index=True the row
# index is written as an extra leading column, and header=True writes column names.
app_test.to_csv(r'D:\Users\sgg91044\Downloads\SEDA\CTM_data.csv', index=True, header=True)

# In[243]:

from sklearn.preprocessing import MinMaxScaler
try:
    # scikit-learn >= 0.20 replaced preprocessing.Imputer with impute.SimpleImputer;
    # keep the old name bound so the cell imports on both old and new releases
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
from sklearn.feature_extraction.text import TfidfVectorizer
# Public import paths; `sklearn.linear_model.logistic` and
# `sklearn.cross_validation` are no longer available in current scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# The file was written with its row index as the first column (index=True).
# Read that column back as the index so it does not become a spurious feature.
CTM_data = pd.read_csv(r'D:\Users\sgg91044\Downloads\SEDA\CTM_data.csv', index_col=0)

CTM_data['Target'] = CTM_data['Target'].astype('float')
# The label is kept as a one-column dataframe: a later cell reads y_test['Target']
CTM_data_Target = CTM_data[['Target']]
# Every remaining column is used as a feature
CTM_data_Columns = CTM_data.drop(columns = ['Target'])
# 70 % train / 30 % test, reproducible through the fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    CTM_data_Columns, CTM_data_Target, test_size=0.3, random_state=0)

# In[244]:

# SMOTE comes from the third-party imbalanced-learn package
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

# Report the (rows, columns) shape of each part of the train/test split
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

# In[245]:

# y_train is a one-column dataframe: it has no .ravel(), and the builtin sum()
# over `y_train == 1` would iterate column labels instead of rows. Flatten it
# to a 1-D label array first and count on that.
labels_train = np.ravel(y_train)

print("Before OverSampling, counts of label '1': {}".format((labels_train == 1).sum()))
print("Before OverSampling, counts of label '0': {} \n".format((labels_train == 0).sum()))

sm = SMOTE(random_state=2)
# imbalanced-learn renamed fit_sample to fit_resample; use whichever this
# installation provides
resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
X_train_res, y_train_res = resample(X_train, labels_train)

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '1': {}".format((y_train_res == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_train_res == 0).sum()))

# In[228]:

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))

# Learn each feature's min/max from the training split only and apply that same
# mapping to both splits. Refitting the scaler on the test split would put the
# two splits on different scales.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print('Training data shape: ', X_train.shape)
print('Testing data shape: ', X_test.shape)

# In[229]:

from sklearn.ensemble import RandomForestClassifier

# Make the random forest classifier: 100 trees, fixed seed for reproducibility,
# progress messages enabled (verbose=1), n_jobs=-1 to use all available cores
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)

# In[203]:

# Show the held-out labels (only rendered when run as a notebook cell)
y_test

# In[234]:

# Train on the training data; the labels are flattened because y_train is a
# one-column dataframe and the estimator expects a 1-D label vector
random_forest.fit(X_train, np.ravel(y_train))

# Extract feature importances
feature_importance_values = random_forest.feature_importances_
# `features` was never defined. Take the names from the unscaled feature frame,
# which has the same column order as the training matrix.
features = list(CTM_data_Columns.columns)
feature_importances = pd.DataFrame({'feature': features, 'importance': feature_importance_values})

# Make predictions on the test data
predictions = random_forest.predict(X_test)

# In[232]:

predictions
predictions.shape

# In[233]:

import pandas as pd
# Cross-tabulate true labels against predictions: the first argument supplies
# the rows, the second the columns
print( pd.crosstab(y_test['Target'], predictions, rownames=['Target'], colnames=['preds']))

# In[165]:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Public import paths; `sklearn.linear_model.logistic` and
# `sklearn.cross_validation` are no longer available in current scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

# In[190]:

# cross_val_score takes (estimator, X, y). The previous call passed the test
# labels as X and an undefined name as y.
# NOTE(review): cross-validating on the training split is assumed to be the
# intent - confirm. cv is explicit because its default differs between releases.
cross_val_score(random_forest, X_train, np.ravel(y_train), cv=5)

# In[235]:

def plot_feature_importances(df, n_features=20):
    """
    Plot importances returned by a model. This can work with any measure of
    feature importance provided that higher importance is better.

    Args:
        df (dataframe): feature importances. Must have the features in a column
            called `feature` and the importances in a column called `importance`
        n_features (int): how many of the most important features to draw
            (default 20)

    Returns:
        shows a horizontal bar chart of the `n_features` most important features

        df (dataframe): feature importances sorted by importance (highest to lowest)
            with a column for normalized importance; the previous index is kept
            as a column by reset_index()
    """

    # Sort features according to importance
    df = df.sort_values('importance', ascending = False).reset_index()

    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()

    # Rows to draw, and their bar positions in reverse order so that the most
    # important feature ends up at the top of the chart
    top = df.head(n_features)
    positions = list(reversed(list(top.index)))

    # Make a horizontal bar chart of feature importances
    plt.figure(figsize = (10, 6))
    ax = plt.subplot()
    ax.barh(positions,
            top['importance_normalized'],
            align = 'center', edgecolor = 'k')

    # Set the yticks and labels
    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'])

    # Plot labeling
    plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
    plt.show()

    return df

# In[236]:

# Draw the importance chart and keep the sorted, normalized table it returns
feature_importances_sorted = plot_feature_importances(feature_importances)

我的代码-statistic analysis的更多相关文章

Web 检测代码 web analysis 开源 open source
1. Grape Web Statistics Grape Web Statistics is a fairly simple piece of analytics software. Grape i ...
Android内存等信息
1. Linux中proc目录下文件详解 http://wenku.baidu.com/view/2ce89f00a6c30c2259019ef1.html 2. Android系统/proc目录详解 ...
结构-行为-样式-angularJs 指令实现滚动文字
最近在做XX项目的大屏展示页面,有一个表格需要用到这个滚动效果,于是就写了个指令,记录下,共同学习. Html代码: <td word-roll tword="item"&g ...
以太坊智能合约虚拟机(EVM)原理与实现
以太坊 EVM原理与实现以太坊底层通过EVM模块支持合约的执行与调用,调用时根据合约地址获取到代码,生成环境后载入到EVM中运行.通常智能合约的开发流程是用solidlity编写逻辑代码,再通过编译 ...
20165227 2017-2018-2《Java程序设计》课程总结
20165227 2017-2018-2<Java程序设计>课程总结每周作业链接汇总预备作业1 简要内容: 记忆深刻的老师我期望的师生关系对于Java学习的看法预备作业2 简要内 ...
C++解析头文件-Qt自动生成信号定义
目录一.概述二.实现思路三.代码讲解 1.类图 2.QtCppDescription 3.测试四.源代码一.概述上一篇文章C++解析头文件-Qt自动生成信号声明我们主要讲解了怎么去解析C+ ...
344. Reverse String【easy】
344. Reverse String[easy] Write a function that takes a string as input and returns the string rever ...
手记系列之二 ----- 关于IDEA的一些使用方法经验
前言本篇文章主要介绍的关于本人在使用IDEA的一些使用方法,一些常用设置,一些插件推荐和使用.请注意,本文特长,2w多字加上几十张图片,建议收藏观看~ 前提准备 idea官网: https://ww ...
数据关联分析 association analysis (Aprior算法，python代码）
1基本概念购物篮事务(market basket transaction),如下表,表中每一行对应一个事务,包含唯一标识TID,和购买的商品集合.本文介绍一种成为关联分析(association a ...

随机推荐

关于SQLSERVER数据库连接池
页内导航 1.如何开启连接池? 2. 那连接池是和有什么有关呢? 3.如何使用相同的连接池访问不同的数据库? ‘关于数据库连接池大家都听说过或者用过,但真正的了解有多少呢? 数据连接池如何启用?有哪些 ...
一条命令关掉centos所有不必要的服务和端口号
centos作为服务器开放的服务多了,难免一些服务软件有漏洞,开放的端口号越多,上线的服务器越危险,所以我们必须在服务器上线之前把centos里面不必要的服务全部干掉,不让坏人有可乘之机. 首先看一下 ...
JDBCUtils——C3P0
需要导入的包: mysql-connector-java-5.1.37-bin.jar c3p0-0.9.2-pre5.jar mchange-commons-java-0.2.3.jar 如果使用D ...
mysql5.7.20 windows 解压缩版安装
1.下载文件下载路径:https://dev.mysql.com/downloads/mysql/ 2.配置文件在解压的文件夹内新建my.ini文件,并加入以下内容: [mysql] # 设置my ...
linux中常见的命令
linux 中的命令非常多,但是玩过linux的人也从来不会因为Linux的命令如此之多而烦恼,我们只需要掌握其中常见的命令即可,可以在使用时去找man,会帮助你解决不少问题.下面就列出一些常见的li ...
练习 map集合被使用是因为具备映射关系 "进度班" "01" "张三" "进度班" "02" "李四" "J1701" "01" "王五" "J1701" "02" "王二" 此信息中，我们要怎样把上述信息装入集合中，根据班级信息的到所有的所有信
package com.rf.xs; import java.util.Arrays; public class Student01 { String name; int age; public St ...
JAVA Number与Math类
Number类: 当要用到数字的时候,我们除了使用内置数据类型byte,int,double等来声明,我们还把它声明为一个对象: 所有的包装类(Integer.Long.Byte.Double.Flo ...
Oracle学习DayFive（PL/SQL）
一.PL/SQL简介 PL/SQL 是 Procedure Language & Structured Query Language 的缩写.PL/SQL 是对 SQL 语言存储过程语言的扩 ...
tensorFlow入门实践（三）初识AlexNet实现结构
参考黄文坚<TensorFlow实战>一书,完成AlexNet的整体实现并展望其训练和预测过程. import tensorflow as tf batch_size = 32 num_b ...
移动端调试神器 whistle
移动端h5页面嵌入的方式多种多样,普通的chrome真机调试功能有时就不能满足要求了. whistle通过设置wifi服务器代理的方式,可以抓包调试所有移动端请求的页面. whistle的github ...

我的代码-statistic analysis

我的代码-statistic analysis的更多相关文章

随机推荐

热门专题