# coding: utf-8

# In[1]:

# numpy and pandas for data manipulation
import numpy as np
import pandas as pd

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system manangement
import os

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# In[3]:

# Test data features
app_test = pd.read_csv(r'D:\Users\sgg91044\Desktop\MEP_no_defect_data_pivot.csv')
print('Testing data shape: ', app_test.shape)
app_test.head(20000)

# In[4]:

app_test['Target'].value_counts()
app_test['Target'].astype(int).plot.hist();

# In[5]:

# Function to calculate missing values by column# Funct
def missing_values_table(app_test):
# Total missing values
mis_val = app_test.isnull().sum()

# Percentage of missing values
mis_val_percent = 100 * app_test.isnull().sum() / len(app_test)

# Make a table with the results
mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)

# Rename the columns
mis_val_table_ren_columns = mis_val_table.rename(
columns = {0 : 'Missing Values', 1 : '% of Total Values'})

# Sort the table by percentage of missing descending
mis_val_table_ren_columns = mis_val_table_ren_columns[
mis_val_table_ren_columns.iloc[:,1] != 0].sort_values('% of Total Values', ascending=False).round(1)

# Print some summary information
print ("Your selected dataframe has " + str(app_test.shape[1]) + " columns.\n"
"There are " + str(mis_val_table_ren_columns.shape[0]) + " columns that have missing values.")

# Return the dataframe with missing information
return mis_val_table_ren_columns

# In[6]:

# Missing values statistics
missing_values = missing_values_table(app_test)
missing_values.head(20)

# In[7]:

RR13_MAX_median = app_test['RR13_MAX.'].median()
ETCM_PHC4_median = app_test['ETCM_PHC4'].median()
HELK_MEAN_median = app_test['HELK_MEAN'].median()
PBK4_median = app_test['PBK4'].median()
ETCM_PHB4_median = app_test['ETCM_PHB4'].median()
ETCM_PHA4_median = app_test['ETCM_PHA4'].median()
THR3_MAX_median = app_test['THR3_MAX.'].median()
THR3_MEAN_median = app_test['THR3_MEAN'].median()
RR23_MEAN_median = app_test['RR23_MEAN'].median()
RR13_MEAN_median = app_test['RR13_MEAN'].median()
THR3_MEAN_DIFF_median = app_test['THR3_MEAN_DIFF'].median()
THR3_MEAN_SLOPE_median = app_test['THR3_MEAN_SLOPE'].median()
THR3_MEAN_SLOPE_median = app_test['THR3_MEAN_SLOPE'].median()
THR3_MAX_DIFF_median = app_test['THR3_MAX._DIFF'].median()
LOWERCHM_PRESS_DIFF_median = app_test['LOWERCHM_PRESS'].median()
HELK_MAX_median = app_test['HELK_MAX.'].median()
#HELK_MIN_median = app_test['HELK_MIN.'].median()
HELK_SD_median = app_test['HELK_SD'].median()
THR3_SD_median = app_test['THR3_SD'].median()
RR23_MAX_median = app_test['RR23_MAX.'].median()
#RR13_MAX_median
#ETCM_PHC4_median
#HELK_MEAN_median
#PBK4_median
#ETCM_PHB4_median
#ETCM_PHA4_median
#THR3_MAX_median
#THR3_MEAN_median
#RR23_MEAN_median
#RR13_MEAN_median
#THR3_MEAN_DIFF_median
#THR3_MEAN_SLOPE_median
#THR3_MEAN_SLOPE_median
#THR3_MAX_DIFF_median
#LOWERCHM_PRESS_DIFF_median
#HELK_MAX_median
#HELK_MIN_median
#HELK_SD_median
#THR3_SD_median

# In[8]:

app_test=app_test.fillna({'RR13_MAX.':RR13_MAX_median,
'ETCM_PHC4':ETCM_PHC4_median,
'HELK_MEAN':HELK_MEAN_median,
'PBK4':PBK4_median,
'ETCM_PHB4':ETCM_PHB4_median,
'ETCM_PHA4':ETCM_PHA4_median,
'THR3_MAX.':THR3_MAX_median,
'THR3_MEAN':THR3_MEAN_median,
'RR23_MEAN':RR23_MEAN_median,
'RR13_MEAN':RR13_MEAN_median,
'THR3_MEAN_DIFF':THR3_MEAN_DIFF_median,
'THR3_MEAN_SLOPE':THR3_MEAN_SLOPE_median,
'THR3_MEAN_SLOPE':THR3_MEAN_SLOPE_median,
'THR3_MAX._DIFF':THR3_MAX_DIFF_median,
'LOWERCHM_PRESS':LOWERCHM_PRESS_DIFF_median,
'HELK_MAX.':HELK_MAX_median,

'HELK_SD':HELK_SD_median,
'THR3_SD':THR3_SD_median,
'RR23_MAX.':RR23_MAX_median
})
app_test

# In[9]:

# Find correlations with the target and sort
correlations = app_test.corr()['Target'].sort_values()

# Display correlations
print('Most Positive Correlations:\n', correlations.tail(20))
print('\nMost Negative Correlations:\n', correlations.head(20))

# In[10]:

# Measure the correlation of parameter'ETCM_PHC4' and target

plt.figure(figsize = (20, 8))

# KDE plot of wafers that were non-defected
sns.kdeplot(app_test.loc[app_test['Target'] == 0, 'ETCM_PHC4'],label = 'target == 0')
# KDE plot of wafers that were defected
sns.kdeplot(app_test.loc[app_test['Target'] == 1, 'ETCM_PHC4'],label = 'target == 1')

# Labeling of plot
plt.xlabel('ETCM_PHC4'); plt.ylabel('Density'); plt.title('Distribution of ETCM_PHC4');

# In[11]:

sns.boxplot(x='HELK_SD', y='Target', data=app_test)

sns.plot.show()

# In[ ]:

# Scatter Plot
app_test.plot(kind='scatter', x='PBK4', y='Target',alpha = 0.5,color = 'red')
plt.xlabel('PBK4') # label = name of label
plt.ylabel('Target')
plt.title('PBK4 Target Scatter Plot')

# In[ ]:

# Measure the correlation of parameter'HELK_MEAN' and target

plt.figure(figsize = (20, 8))

# KDE plot of wafers that were non-defected
sns.kdeplot(app_test.loc[app_test['Target'] == 0, 'HELK_MEAN'],label = 'target == 0')
# KDE plot of wafers that were defected
sns.kdeplot(app_test.loc[app_test['Target'] == 1, 'HELK_MEAN'],label = 'target == 1')

# Labeling of plot
plt.xlabel('HELK_MEAN'); plt.ylabel('Density'); plt.title('Distribution of HELK_MEAN');

# In[ ]:

plt.figure(figsize = (15, 50))
# iterate through the new features
for i, feature in enumerate(['LOWERCHM_PRESS',
'RR13_MEAN',
'RR13_MAX.',
'RR23_MEAN',
'THR3_MAX.',
'THR3_MEAN',
'RR23_MAX.',
'PBK4',
'THR3_MEAN_DIFF',
'HELK_MEAN',
'THR3_MEAN_SLOPE',
'THR3_MAX._DIFF']):

# create a new subplot for each source
plt.subplot(13, 1, i + 1)
# plot non_defected wafer
sns.kdeplot(app_test.loc[app_test['Target'] == 0, feature], label = 'Target == 0')
# plot defected wafer
sns.kdeplot(app_test.loc[app_test['Target'] == 1, feature], label = 'Target == 1')

# Label the plots
plt.title('Distribution of %s by Target Value' % feature)
plt.xlabel('%s' % feature); plt.ylabel('Density');

plt.tight_layout(h_pad = 0.1)

# In[ ]:

# Bin the parameter'ETCM_PHC4' data
ETCM_PHC4 = app_test[['Target', 'ETCM_PHC4']]
ETCM_PHC4['VALUE_BINNED'] = pd.cut(ETCM_PHC4['ETCM_PHC4'], bins = np.linspace(200, 1000, num = 17))
ETCM_PHC4.head(20)

# In[ ]:

# Group by the bin and calculate averages
ETCM_PHC4_groups = ETCM_PHC4.groupby('VALUE_BINNED').mean()
ETCM_PHC4_groups

# In[ ]:

plt.figure(figsize = (8, 8))

# Graph the age bins and the average of the target as a bar plot
plt.bar(ETCM_PHC4_groups.index.astype(str), 100 * ETCM_PHC4_groups['Target'])

# Plot labeling
plt.xticks(rotation = 75); plt.xlabel('ETCM_PHC4 value groups'); plt.ylabel('Possibility to be defected (%)')
plt.title('Possibility to be defected by ETCM_PHC4');

# In[ ]:

# Bin the parameter'HELK_MEAN' data
HELK_MEAN = app_test[['Target', 'HELK_MEAN']]
HELK_MEAN['VALUE_BINNED'] = pd.cut(HELK_MEAN['HELK_MEAN'], bins = np.linspace(0, 17.5, num = 8))
plt.figure(figsize = (8, 8))

# Group by the bin and calculate averages
HELK_MEAN_groups = HELK_MEAN.groupby('VALUE_BINNED').mean()
# Graph the age bins and the average of the target as a bar plot
plt.bar(HELK_MEAN_groups.index.astype(str), 100 * HELK_MEAN_groups['Target'])

# Plot labeling
plt.xticks(rotation = 75); plt.xlabel('HELK_MEAN value groups'); plt.ylabel('Possibility to be defected (%)')
plt.title('Possibility to be defected by HELK_MEAN');

# In[13]:

# Extract the parameters variables and show correlations
ext_data = app_test[['Target', 'RR13_MAX.',
'ETCM_PHC4' ,
'HELK_MEAN',
'PBK4',
'ETCM_PHB4' ,
'ETCM_PHA4',
'HELK_MAX.',
'HELK_SD',
'THR3_SD' ]]
ext_data_corrs = ext_data.corr()
ext_data_corrs

# In[14]:

plt.figure(figsize = (10, 10))

# Heatmap of correlations
sns.heatmap(ext_data_corrs, cmap = plt.cm.RdYlBu_r, vmin =0, annot = True, vmax = 1)
plt.title('Correlation Heatmap');

# In[ ]:

plt.figure(figsize = (10, 12))

# iterate through the sources
for i, source in enumerate(['ETCM_PHA4', 'ETCM_PHB4', 'ETCM_PHC4']):

# create a new subplot for each source
plt.subplot(3, 1, i + 1)
# plot non-defected
sns.kdeplot(app_test.loc[app_test['Target'] == 0, source], label = 'target == 0',shade=True)
# plot defected
sns.kdeplot(app_test.loc[app_test['Target'] == 1, source], label = 'target == 1',shade=True)

# Label the plots
plt.title('Distribution of %s by Target Value' % source)
plt.xlabel('%s' % source); plt.ylabel('Density');

plt.tight_layout(h_pad = 2.5)

# In[ ]:

# Copy the data for plotting
plot_data = ext_data.drop(columns = ['RR13_MAX.',
'HELK_MAX.',
'HELK_SD',
'THR3_SD' ]).copy()

# Function to calculate correlation coefficient between two columns
def corr_func(x, y, **kwargs):
r = np.corrcoef(x, y)[0][1]
ax = plt.gca()
ax.annotate("r = {:.2f}".format(r),
xy=(.2, .8), xycoords=ax.transAxes,
size = 20)

# Create the pairgrid object
grid = sns.PairGrid(data = plot_data, size = 3, diag_sharey=False,
hue = 'Target',
vars = [x for x in list(plot_data.columns) if x != 'Target'])

# Upper is a scatter plot
grid.map_upper(plt.scatter, alpha = 0.2)

# Diagonal is a histogram
grid.map_diag(sns.kdeplot)

# Bottom is density plot
grid.map_lower(sns.kdeplot, cmap = plt.cm.OrRd_r);

plt.suptitle('Ext Source Features Pairs Plot', size = 32, y = 1.05);

# In[12]:

import seaborn as sns
# Create the default pairplot
sns.pairplot(app_test)

# In[224]:

app_test.to_csv(r'D:\Users\sgg91044\Downloads\SEDA\CTM_data.csv', index=True, header=True)

# In[243]:

from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.cross_validation import train_test_split

CTM_data=pd.read_csv(r'D:\Users\sgg91044\Downloads\SEDA\CTM_data.csv')
# Median imputation of missing values
#RR23_MAX_median = CTM_data['RR23_MAX.'].median()
#CTM_data=CTM_data.fillna({'RR23_MAX.': RR23_MAX_median})

# Drop the target from the training data
#CTM_data['Target_C']=CTM_data['Target'].astype('category')
#CTM_data['Target_C'].cat.categories=['noMEP','MEP']
#le = LabelEncoder()
#le_count = 0
#le.fit(CTM_data['Target_C'])
#CTM_data['Target_C'] = le.transform(CTM_data['Target_C'])
CTM_data['Target']=CTM_data['Target'].astype('float')
CTM_data_Target= CTM_data[['Target']]
CTM_data_Columns = CTM_data.drop(columns = ['Target'])
X_train, X_test, y_train, y_test = train_test_split(CTM_data_Columns,CTM_data_Target,test_size=0.3, random_state=0)

# In[244]:

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

# In[245]:

print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))

# In[228]:

#Y_test = test[['Target_C']]
#Y_train = test.drop(columns = ['Target','Target_C'])

# Feature names
#features = list(train.columns)
#y, _ = pd.factorize(app_train['Target'])

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))

# Fit on the training data
#imputer.fit(train)

# Transform both training and testing data
#train = imputer.transform(train)
#test = imputer.transform(test)

# Repeat with the scaler
scaler.fit(X_train)
X_train = scaler.transform(X_train)
scaler.fit(X_test)
X_test = scaler.transform(X_test)
print('Training data shape: ', X_train.shape)
print('Testing data shape: ', X_test.shape)

# In[229]:

from sklearn.ensemble import RandomForestClassifier

# Make the random forest classifier
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)

# In[203]:

y_test

# In[234]:

# Train on the training data
random_forest.fit(X_train,y_train)

# Extract feature importances
feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': features, 'importance': feature_importance_values})

# Make predictions on the test data
predictions = random_forest.predict(X_test)

# In[232]:

predictions
predictions.shape

# In[233]:

import pandas as pd
# 第一个参数是行索引,第二个属性为列索引
print( pd.crosstab(y_test['Target'], predictions, rownames=['Target'], colnames=['preds']))
# Make a submission dataframe

# In[165]:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.cross_validation import train_test_split,cross_val_score

# In[190]:

cross_val_score(random_forest,y_test,submit )

# In[235]:

def plot_feature_importances(df):
"""
Plot importances returned by a model. This can work with any measure of
feature importance provided that higher importance is better.

Args:
df (dataframe): feature importances. Must have the features in a column
called `features` and the importances in a column called `importance

Returns:
shows a plot of the 15 most importance features

df (dataframe): feature importances sorted by importance (highest to lowest)
with a column for normalized importance
"""

# Sort features according to importance
df = df.sort_values('importance', ascending = False).reset_index()

# Normalize the feature importances to add up to one
df['importance_normalized'] = df['importance'] / df['importance'].sum()

# Make a horizontal bar chart of feature importances
plt.figure(figsize = (10, 6))
ax = plt.subplot()

# Need to reverse the index to plot most important on top
ax.barh(list(reversed(list(df.index[:20]))),
df['importance_normalized'].head(20),
align = 'center', edgecolor = 'k')

# Set the yticks and labels
ax.set_yticks(list(reversed(list(df.index[:20]))))
ax.set_yticklabels(df['feature'].head(20))

# Plot labeling
plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
plt.show()

return df

# In[236]:

feature_importances_sorted = plot_feature_importances(feature_importances)

我的代码-statistic analysis的更多相关文章

  1. Web 检测代码 web analysis 开源 open source

    1. Grape Web Statistics Grape Web Statistics is a fairly simple piece of analytics software. Grape i ...

  2. Android内存等信息

    1. Linux中proc目录下文件详解 http://wenku.baidu.com/view/2ce89f00a6c30c2259019ef1.html 2. Android系统/proc目录详解 ...

  3. 结构-行为-样式-angularJs 指令实现滚动文字

    最近在做XX项目的大屏展示页面,有一个表格需要用到这个滚动效果,于是就写了个指令,记录下,共同学习. Html代码: <td word-roll tword="item"&g ...

  4. 以太坊智能合约虚拟机(EVM)原理与实现

    以太坊 EVM原理与实现 以太坊底层通过EVM模块支持合约的执行与调用,调用时根据合约地址获取到代码,生成环境后载入到EVM中运行.通常智能合约的开发流程是用solidlity编写逻辑代码,再通过编译 ...

  5. 20165227 2017-2018-2《Java程序设计》课程总结

    20165227 2017-2018-2<Java程序设计>课程总结 每周作业链接汇总 预备作业1 简要内容: 记忆深刻的老师 我期望的师生关系 对于Java学习的看法 预备作业2 简要内 ...

  6. C++解析头文件-Qt自动生成信号定义

    目录 一.概述 二.实现思路 三.代码讲解 1.类图 2.QtCppDescription 3.测试 四.源代码 一.概述 上一篇文章C++解析头文件-Qt自动生成信号声明我们主要讲解了怎么去解析C+ ...

  7. 344. Reverse String【easy】

    344. Reverse String[easy] Write a function that takes a string as input and returns the string rever ...

  8. 手记系列之二 ----- 关于IDEA的一些使用方法经验

    前言 本篇文章主要介绍的关于本人在使用IDEA的一些使用方法,一些常用设置,一些插件推荐和使用.请注意,本文特长,2w多字加上几十张图片,建议收藏观看~ 前提准备 idea官网: https://ww ...

  9. 数据关联分析 association analysis (Aprior算法,python代码)

    1基本概念 购物篮事务(market basket transaction),如下表,表中每一行对应一个事务,包含唯一标识TID,和购买的商品集合.本文介绍一种成为关联分析(association a ...

随机推荐

  1. Python_Mix*re模块基础方法,进阶,正则表达式的使用

    re模块import re 基础方法 findall:匹配所有 ,每一项都是列表中的一个元素,返回的是列表 search只匹配从左到右的第一个,得到的不是直接的结果,而是一个变量,通过这个变量的gro ...

  2. bzoj4700

    题解: cdq分治 先考虑没有人被秒掉的情况 代码: #include<bits/stdc++.h> #define y1 ____y1 ; using namespace std; ty ...

  3. Python编写的记事本小程序

    用Python中的Tkinter模块写的一个简单的记事本程序,Python2.x和Python3.x的许多内置函数有所改变,所以以下分为Python2.x和Python3.x版本. 一.效果展示: 二 ...

  4. Python-接口自动化(七)

    requests模块(七) (八)requests模块 1.requests是用python语言编写,属于第三方库,基于urllib,采用Apache2 Licensed开源协议的HTTP库,它比ur ...

  5. MyBatis工具类

    package cn.word.util; import java.io.IOException;import java.io.InputStream;import java.util.Enumera ...

  6. 浅谈java 之 Map

    先来一张Map的类继承图 Map :Hashtable .HashMap .LinkedHashMap .TreeMap 的比较   1.Hashtable的方法实现了synchronized 是线程 ...

  7. ecplise中设置字体大小和背景

    1 将ecplise中的代码背景设置为豆沙色 2 设置ecplise中的字体大小

  8. 干货 | SSMS客户端连接京东云RDS SQL Server配置方法

    干货 | SSMS客户端连接京东云RDS SQL Server配置方法 原创: 于振江 京东云开发者社区  微软SQL Server, Oracle数据库以及MySQL系列占据了关系型数据库市场的绝对 ...

  9. linux发布环境初始化脚本

    #参数配置 homeDir=$(pwd) tomcatDir=$homeDir/tomcat logDir=$homeDir/tomcat/logs backUpDir=$homeDir/backup ...

  10. swoole架构分析

    swoole的进程/线程结构 结构图如下: swoole主要由Master进程(主进程)和Manager进程配合使用完成其功能. Master进程 是一个多线程的程序.其中有一组很重要的线程,称之为R ...