01.Series

 # -*- coding: utf-8 -*-
 """
 Series 객체 특징
  - pandas 제공 1차원 자료구성
  - DataFrame 칼럼 구성요소
  - 수학/통계 관련 함수 제공
  - 범위 수정, 블럭 연산
  - indexing/slicing(list 동일)
  - 시계열 데이터 처리
 """

 import pandas as pd #pd.Series()
 from pandas import Series

 # 1. Creating a Series

 # 1) From a list
 # NOTE: the list is deliberately not named `list`. Rebinding the builtin
 # would make every later `list(...)` call in the same session fail with
 # "TypeError: 'list' object is not callable".
 price_list = [4000, 3000, 2000, 3500]
 print(price_list * 2)  # list repetition: [4000, 3000, 2000, 3500, 4000, 3000, 2000, 3500]

 price = Series(price_list)
 print(price * 2)  # element-wise multiplication, unlike the list above
 # 0    8000
 # 1    6000
 # 2    4000
 # 3    7000
 # dtype: int64
 print("index=", price.index)  # index= RangeIndex(start=0, stop=4, step=1)
 print("value=", price.values)  # value= [4000 3000 2000 3500]
 print(price_list[0], price[0])  # 4000 4000

 # 2) From a dict: keys become the index, values become the data
 person = pd.Series({'name': '홍길동', 'age': 35, 'addr': '서울시'})
 print(person)
 # Output recorded by the original author (keys appear sorted):
 #   addr    서울시
 #   age      35
 #   name    홍길동
 #   dtype: object
 # NOTE(review): newer pandas versions are expected to keep the dict's
 # insertion order (name, age, addr) instead — confirm on your version.

 # 2. Indexing and slicing
 ser_data = pd.Series([4, 4.5, 6, 8, 10.5])

 print(ser_data[0])  # 4.0  (looks up the index LABEL 0)
 print(ser_data[:3])
 # 0    4.0
 # 1    4.5
 # 2    6.0
 # dtype: float64
 print(ser_data[3:])
 # 3     8.0
 # 4    10.5
 # dtype: float64
 print(ser_data[:])
 # 0     4.0
 # 1     4.5
 # 2     6.0
 # 3     8.0
 # 4    10.5
 # dtype: float64

 # `ser_data[-1]` fails here because an integer key is treated as a label and
 # no label -1 exists. Use .iloc for list-style positional access instead.
 print(ser_data.iloc[-1])  # 10.5

 # Boolean condition: keep only the elements where the mask is True
 print(ser_data[ser_data >= 5])
 # 2     6.0
 # 3     8.0
 # 4    10.5
 # dtype: float64

 # 3. Combining Series and handling missing values (NA)
 data1 = Series(data=[4000, None, 3000, 2000], index=['a', 'm', 'o', 'k'])
 data2 = Series(data=[4000, 3000, 3500, 2000], index=['a', 'o', 'k', 'm'])

 # Arithmetic joins the two Series on their index labels, not on position.
 combined = data1 + data2
 print(combined)
 # a    8000.0
 # k    5500.0
 # m       NaN   <- number + missing value gives NaN
 # o    6000.0
 # dtype: float64
 print(type(combined))  # <class 'pandas.core.series.Series'>

 # NA handling: replace with 0, or with the mean
 result2 = combined.fillna(value=0)
 print(result2)
 # a    8000.0
 # k    5500.0
 # m       0.0
 # o    6000.0
 # dtype: float64

 mean_of_present = combined.mean()  # 6500.0 in the recorded output below
 result3 = combined.fillna(value=mean_of_present)
 print(result3)
 # a    8000.0
 # k    5500.0
 # m    6500.0
 # o    6000.0
 # dtype: float64

 print(combined.notna())
 # a     True
 # k     True
 # m    False
 # o     True
 # dtype: bool

 # Subset without the missing entries
 subset = combined.dropna()
 print(subset)
 # a    8000.0
 # k    5500.0
 # o    6000.0
 # dtype: float64

 # 4. Series operations
 print(ser_data)
 # 0     4.0
 # 1     4.5
 # 2     6.0
 # 3     8.0
 # 4    10.5
 # dtype: float64

 # 1) Block update: assign one value to a whole slice
 ser_data[1:4] = 50
 print(ser_data)
 # 0     4.0
 # 1    50.0
 # 2    50.0
 # 3    50.0
 # 4    10.5
 # dtype: float64

 # 2) Math / statistics methods, printed in the order sum, mean, max, min
 #    Recorded values: 164.5, 32.9, 50.0, 4.0
 for stat in (ser_data.sum, ser_data.mean, ser_data.max, ser_data.min):
     print(stat())

 # 3) Broadcasting: the scalar is applied to every element of the vector
 print(ser_data.mul(0.5))
 # 0     2.00
 # 1    25.00
 # 2    25.00
 # 3    25.00
 # 4     5.25
 # dtype: float64

02.DataFrame

 # -*- coding: utf-8 -*-
 """
 Created on Sat Feb  9 12:34:12 2019

 @author: 502-03
 DataFrame 객체 특징
  - Pandas제공 2차원 행렬구조 (table 구조 동일)
  - 칼럼 단위 상이한 자료형 제공
  - DataFrame 구성요소
      -> Series : 1 차원 (vector)

 """

 import pandas as pd
 from pandas import DataFrame

 # 1. Creating a DataFrame from one list per column
 name = ['홍길동', '이순신', '강감찬', '유관순']
 age = [35, 45, 55, 25]
 pay = [350, 450, 550, 250]

 column_data = {'name': name, 'age': age, 'pay': pay}
 column_order = ['name', 'age', 'pay']  # explicit column order
 emp = pd.DataFrame(column_data, columns=column_order)
 print(emp)
 #   name  age  pay
 # 0  홍길동   35  350
 # 1  이순신   45  450
 # 2  강감찬   55  550
 # 3  유관순   25  250

 # 1) Add a column from a Series object
 gender = pd.Series(['M', 'M', 'M', 'F'])
 emp['gender'] = gender
 print(emp)
 #   name  age  pay gender
 # 0  홍길동   35  350      M
 # 1  이순신   45  450      M
 # 2  강감찬   55  550      M
 # 3  유관순   25  250      F

 # 2) Building a DataFrame from a NumPy array
 import numpy as np
 grid = np.arange(12).reshape(3, 4)  # values 0..11 laid out as 3 rows x 4 columns
 frame = pd.DataFrame(grid, columns=['a', 'b', 'c', 'd'])
 print(frame)
 #    a  b   c   d
 # 0  0  1   2   3
 # 1  4  5   6   7
 # 2  8  9  10  11

 # Row / column statistics
 print(frame.mean())  # default: one mean per column
 # a    4.0
 # b    5.0
 # c    6.0
 # d    7.0
 # dtype: float64
 print(frame.mean(axis='index'))  # same as axis=0: one mean per column
 # a    4.0
 # b    5.0
 # c    6.0
 # d    7.0
 # dtype: float64
 print(frame.mean(axis='columns'))  # same as axis=1: one mean per row
 # 0    1.5
 # 1    5.5
 # 2    9.5
 # dtype: float64

 # 2. Inspecting and changing the index
 print(frame.index)  # RangeIndex(start=0, stop=3, step=1)
 print(frame.values)
 # [[ 0  1  2  3]
 #  [ 4  5  6  7]
 #  [ 8  9 10 11]]
 print(frame.columns)
 # Index(['a', 'b', 'c', 'd'], dtype='object')

 # 1) Use column 'a' as the index; the result is a new frame
 indexed_by_a = frame.set_index(keys='a')
 print(indexed_by_a)
 #    b   c   d
 # a
 # 0  1   2   3
 # 4  5   6   7
 # 8  9  10  11

 # 2) Reset the index: 'a' becomes an ordinary column again
 restored = indexed_by_a.reset_index()
 print(restored)
 #    a  b   c   d
 # 0  0  1   2   3
 # 1  4  5   6   7
 # 2  8  9  10  11

 # 3. Referencing DataFrame columns

 # 1) Single column: both forms return the same Series
 a_col1 = frame.a     # attribute style: DF.column
 a_col2 = frame['a']  # key style: DF['column']

 print(a_col1)
 # 0    0
 # 1    4
 # 2    8
 # Name: a, dtype: int32
 # NOTE(review): the integer dtype in the recorded output (int32) is
 # presumably platform-dependent; other platforms may show int64.
 print(a_col2)
 # 0    0
 # 1    4
 # 2    8
 # Name: a, dtype: int32

 # Single cell: one .loc[row_label, column_label] lookup instead of the
 # chained frame['a'][2], stored under its own name so a_col2 stays a Series.
 a_value = frame.loc[2, 'a']
 print(a_value)  # 8

 # 2) Several columns: pass a LIST of names (a slice such as 'a':'c' is not
 #    valid inside the inner brackets)
 print(frame[['a', 'c']])
 #    a   c
 # 0  0   2
 # 1  4   6
 # 2  8  10

 cols = ['a', 'b']
 # A bare `frame[cols]` expression shows nothing when run as a script,
 # so the selection is printed explicitly.
 print(frame[cols])
 #    a  b
 # 0  0  1
 # 1  4  5
 # 2  8  9

 # 4. Making subsets

 # 1) Exclude columns by selecting only the ones to keep
 print('subset1')
 subset_df = frame[['a', 'c', 'd']]
 print(subset_df)
 #    a   c   d
 # 0  0   2   3
 # 1  4   6   7
 # 2  8  10  11

 # 2) Exclude rows: drop() takes index labels and returns a new object,
 #    so `frame` itself is left unchanged
 print('drop')
 print(frame.drop(0))  # without the first row
 #    a  b   c   d
 # 1  4  5   6   7
 # 2  8  9  10  11
 print(frame.drop(1))  # without the second row
 #    a  b   c   d
 # 0  0  1   2   3
 # 2  8  9  10  11

 a_col = frame['a']  # DataFrame (2-D) -> Series (1-D)
 print(type(a_col))  # <class 'pandas.core.series.Series'>

 # 3) Delete rows based on the values of column 'a'
 print(frame)
 #    a  b   c   d
 # 0  0  1   2   3
 # 1  4  5   6   7
 # 2  8  9  10  11

 # Show each (index label, value) pair being tested.
 for i, c in a_col.items():
     print('i=', i, 'c=', c)
 # i= 0 c= 0
 # i= 1 c= 4
 # i= 2 c= 8

 # One boolean mask replaces the row-by-row drop() loop. The loop passed
 # enumerate() positions to drop(), which expects index labels, so it was
 # only correct for a default 0..n-1 index. `~(a_col < 5)` keeps exactly the
 # rows the loop did not drop.
 subset_df2 = frame[~(a_col < 5)]
 print(subset_df2)
 #    a  b   c   d
 # 2  8  9  10  11

 # 3) Frames with many columns: select through a list of column names
 iris = pd.read_csv("../data/iris.csv")
 print(iris.info())
 # <class 'pandas.core.frame.DataFrame'>
 # RangeIndex: 150 entries, 0 to 149
 # Data columns (total 5 columns):
 # Sepal.Length    150 non-null float64
 # Sepal.Width     150 non-null float64
 # Petal.Length    150 non-null float64
 # Petal.Width     150 non-null float64
 # Species         150 non-null object
 # dtypes: float64(4), object(1)
 # memory usage: 5.9+ KB
 # None
 print(type(iris))  # DataFrame
 print(iris.columns)
 # Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
 #        'Species'],
 #       dtype='object')

 # Extract the column names as a plain Python list.
 # Author's note: .tolist() is used instead of list(iris.columns) --
 # presumably because the builtin `list` can be rebound earlier in the session.
 cols = iris.columns.tolist()
 print(cols)
 # ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']

 first_name = cols[0]
 print(iris[first_name])  # first column
 # 0      5.1
 # 1      4.9
 # 2      4.7
 # 3      4.6
 # ...
 # 145    6.7
 # 146    6.3
 # 147    6.5
 # 148    6.2
 # 149    5.9

 last_name = cols[-1]
 print(iris[last_name])  # last column
 # 0         setosa
 # 1         setosa
 # 2         setosa
 # 3         setosa
 # 4         setosa
 # ...
 # 146    virginica
 # 147    virginica
 # 148    virginica
 # 149    virginica

 # Columns 1-3, first spelled out by name ...
 print(iris[['Sepal.Length', 'Sepal.Width', 'Petal.Length']])
 # 0             5.1          3.5           1.4
 # 1             4.9          3.0           1.4
 # 2             4.7          3.2           1.3
 # 3             4.6          3.1           1.5
 # 4             5.0          3.6           1.4
 # 5             5.4          3.9           1.7
 # ...
 # 146           6.3          2.5           5.0
 # 147           6.5          3.0           5.2
 # 148           6.2          3.4           5.4
 # 149           5.9          3.0           5.1
 # [150 rows x 3 columns]

 # ... then through a slice of the name list (recommended)
 leading_three = cols[:3]
 print(iris[leading_three])
 # 0             5.1          3.5           1.4
 # 1             4.9          3.0           1.4
 # 2             4.7          3.2           1.3
 # 3             4.6          3.1           1.5
 # ...
 # 146           6.3          2.5           5.0
 # 147           6.5          3.0           5.2
 # 148           6.2          3.4           5.4
 # 149           5.9          3.0           5.1
 # [150 rows x 3 columns]

 print(iris.head())
 #    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
 # 0           5.1          3.5           1.4          0.2  setosa
 # 1           4.9          3.0           1.4          0.2  setosa
 # 2           4.7          3.2           1.3          0.2  setosa
 # 3           4.6          3.1           1.5          0.2  setosa
 # 4           5.0          3.6           1.4          0.2  setosa

 # Columns 1-4 -> x, column 5 -> y
 x_names = cols[:4]
 iris_x = iris[x_names]
 iris_y = iris[last_name]

 print(iris_x.shape)  # (150, 4)  two-dimensional
 print(iris_y.shape)  # (150,)    one-dimensional

 # 5. Row/column indexing of a DataFrame (compare R: df[row, col1:col3])
 #
 # `DataFrame.ix` has been removed from pandas, so the two explicit indexers
 # are used instead:
 #   DF.loc[row_label, col_label]      label-based; slices include the end label
 #   DF.iloc[row_position, col_position]  position-based
 #   - a colon (:) selects a contiguous range
 print('frame')
 print(frame)
 #    a  b   c   d
 # 0  0  1   2   3
 # 1  4  5   6   7
 # 2  8  9  10  11
 print(frame.loc[1])  # a single key selects a row
 # a    4
 # b    5
 # c    6
 # d    7
 # Name: 1, dtype: int32
 print(frame.iloc[1, 2])  # 2nd row, 3rd column -> 6
 print(frame.loc[:, 'd'])  # every row of column d
 # 0     3
 # 1     7
 # 2    11
 # Name: d, dtype: int32
 print(frame.loc[:, 'b':'c'])  # every row of columns b through c
 #    b   c
 # 0  1   2
 # 1  5   6
 # 2  9  10

 print(len(iris))  # number of observations: 150

 import numpy as np
 # choice(n, k, replace=False) draws k distinct integers from 0..n-1
 idx = np.random.choice(10, 5, replace=False)
 print(idx)  # e.g. [3 4 9 0 6]

 # Draw 70% of the row positions at random, without repetition.
 idx = np.random.choice(len(iris), int(len(iris) * 0.7),
                        replace=False)
 print(idx, len(idx))
 # e.g. [  9  75   1 138  16 ...  81  44] 105

 # idx holds row POSITIONS, so select with .iloc (`.ix` has been removed
 # from pandas).
 train_set = iris.iloc[idx, :]
 print(train_set.shape)  # (105, 5)

03.Descriptive

 # -*- coding: utf-8 -*-
 """
 1. DataFrame 요약통계량
 2. 변수 간의 상관성 분석
 """

 import pandas as pd

 product = pd.read_csv('../data/product.csv')
 print(product.info())

 # Descriptive statistics
 summary = product.describe()
 print(summary)

 # Row / column statistics: axis=0 ('index') or axis=1 ('columns')
 print(product.sum(axis='index'))  # one total per column
 # a    773
 # b    827
 # c    817
 print(product.sum(axis='columns'))  # one total per row

 # Dispersion
 print(product.var())  # variance
 print(product.std())  # standard deviation

 # Frequency of each value in column a
 a_cnt = product.a.value_counts()
 print(a_cnt)
 # 3    126
 # 4     64
 # 2     37
 # 1     30
 # 5      7

 # Distinct values of column b
 b_uni = product.b.unique()
 print(b_uni)  # [4 3 2 5 1]

 # Correlation between variables (-1 <= r <= 1)
 p_corr = product.corr()
 print(p_corr)
 #           a         b         c
 # a  1.000000  0.499209  0.467145
 # b  0.499209  1.000000  0.766853
 # c  0.467145  0.766853  1.000000

 # Correlation of a single pair of columns
 ac_corr = product.a.corr(product.c)
 print(ac_corr)  # 0.4671449836008965

 # Exercise: correlation analysis (r) on iris columns 1-4
 # `iris` is not created anywhere in this script, so load it here.
 # NOTE(review): path assumed to match the other data files under ../data/
 # -- confirm.
 iris = pd.read_csv("../data/iris.csv")
 # .tolist() does not depend on the builtin name `list`, which would break
 # list(iris.columns) if `list` had been rebound earlier in the session.
 cols = iris.columns.tolist()
 print(cols)  # list of the 5 column names
 iris_sub = iris[cols[:4]]  # the first four columns only

 print(iris_sub.corr())

04.merge

 # -*- coding: utf-8 -*-
 """
 DataFrame merge
 """

 import pandas as pd

 wdbc = pd.read_csv("../data/wdbc_data.csv")
 print(wdbc.info())
 # RangeIndex: 569 entries, 0 to 568
 # Data columns (total 32 columns):

 # .tolist() does not depend on the builtin name `list`.
 cols = wdbc.columns.tolist()
 print(cols)

 df1 = wdbc[cols[:16]]  # columns 1-16
 sid = wdbc['id']       # id column
 # Take an explicit copy before adding a column; assigning into a slice of
 # `wdbc` triggers pandas' chained-assignment warning.
 df2 = wdbc[cols[16:]].copy()  # columns 17-32

 df2['id'] = sid

 print(df1.shape)  # (569, 16)
 print(df2.shape)  # (569, 17)

 # 1. Merge the two frames on the id column.
 #    The key is named explicitly instead of relying on the default of
 #    "every column the two frames share".
 df_merge = pd.merge(df1, df2, on='id', how='inner')
 print(df_merge.info())
 # <class 'pandas.core.frame.DataFrame'>
 # Int64Index: 569 entries, 0 to 568
 # Data columns (total 32 columns):

 # 2. Attach frames column-wise (no key; rows are aligned on the index)
 df1 = wdbc[cols[:16]]  # columns 1-16
 df2 = wdbc[cols[16:]]  # columns 17-32

 df_merge2 = pd.concat([df1, df2], axis=1)  # column-wise concatenation
 print(df_merge2.info())
 # <class 'pandas.core.frame.DataFrame'>
 # RangeIndex: 569 entries, 0 to 568
 # Data columns (total 32 columns):

05.timeSeries

 # -*- coding: utf-8 -*-
 """
 시계열 데이터 시각화
  1. 날짜형식 수정(다국어 -> 한국어)
  2. 시계열 시각화
  3. 이동평균 기능
 """

 import pandas as pd
 from datetime import datetime # 날짜형식 수정 

 cospi = pd.read_csv("../data/cospi.csv")
 print(cospi.info())
 # RangeIndex: 247 entries, 0 to 246
 # Data columns (total 6 columns):
 # Date      247 non-null object
 # Open      247 non-null int64
 # High      247 non-null int64
 # Low       247 non-null int64
 # Close     247 non-null int64
 # Volume    247 non-null int64

 print(cospi.head())
 # 0  26-Feb-16  1180000  1187000  1172000  1172000  176906
 # Goal: turn text such as 26-Feb-16 into a real date (2016-02-26)

 # 1. Convert the date format (day-monthname-year text -> datetime objects)
 Date = cospi['Date']  # same as cospi.Date
 # "%d-%b-%y" = day, abbreviated month name, two-digit year
 kDate = [datetime.strptime(text, "%d-%b-%y") for text in Date]

 print(kDate[:10])

 cospi['Date'] = kDate  # replace the text column with the parsed dates
 print(cospi.head())

 # 2. Time-series visualisation
 import matplotlib.pyplot as plt

 # Trend line of a single column
 high_only = cospi['High']
 high_only.plot(title="Trend line of High column")
 plt.show()

 # Trend lines of two columns on one chart
 high_and_low = cospi[['High', 'Low']]
 high_and_low.plot(title="Trend line of High vs Low")
 plt.show()

 # Changing the index
 print(cospi.index)
 # RangeIndex(start=0, stop=247, step=1)

 # Use the Date column as the index; the result is a new frame
 new_cospi = cospi.set_index(keys='Date')
 print(new_cospi.head())

 # Search by year (partial-string lookup on the date index).
 # NOTE(review): the two statements here were garbled in the source (only
 # "'])" survived) and did not parse. They are reconstructed as year lookups;
 # the years are an assumption based on the sample row dated 26-Feb-16 and
 # the 247-row length -- confirm against the data.
 # .loc is used because selecting rows with df['2016'] is not supported by
 # current pandas.
 print(new_cospi.loc['2016'])
 print(new_cospi.loc['2015'])

 # Search by month
 print(new_cospi.loc['2016-02'])
 # Search by range
 # NOTE(review): the bounds are written newest-first; presumably the index is
 # in descending date order as in the CSV -- verify, since an ascending index
 # would give an empty result.
 print(new_cospi.loc['2016-02':'2016-01'])

 new_cospi_HL = new_cospi[['High', 'Low']]
 # NOTE(review): the key of this lookup was also garbled in the source;
 # '2016' is assumed -- confirm.
 new_cospi_HL.loc['2016'].plot(title="title")
 plt.show()

 new_cospi_HL.loc['2016-02'].plot(title="title")
 plt.show()

 # 3. Moving (rolling) averages

 # Windows of 5, 10 and 20 rows over the High column
 high = new_cospi.High
 roll_mean5 = high.rolling(window=5, center=False).mean()
 print(roll_mean5)

 roll_mean10 = high.rolling(window=10, center=False).mean()

 roll_mean20 = high.rolling(window=20, center=False).mean()

 # Plot the raw column and the three rolling means on one chart
 high.plot(color='orange', label='High column')
 roll_mean5.plot(color='red', label='5day rolling mean')
 roll_mean10.plot(color='green', label='10day rolling mean')
 roll_mean20.plot(color='blue', label='20day rolling mean')
 plt.legend(loc='best')
 plt.show()

01.pandas的更多相关文章

  1. python--Numpy and Pandas 笔记01

    博客地址:http://www.cnblogs.com/yudanqu/ 1 import numpy as np import pandas as pd from pandas import Ser ...

  2. 使用pandas进行数据预处理01

    数据预处理有四种技术:数据合并,数据清洗,数据标准化,以及数据转换. 数据合并技术:(1)横向或纵向堆叠合并数据 (2)主键合并数据 (3)重叠合并数据 1.堆叠合并数据: 堆叠就是简单的把两个表拼接在 ...

  3. [Pandas] 01 - A guy based on NumPy

    主要搞明白NumPy“为什么快”. 学习资源 Panda 中文 易百教程 远程登录Jupyter笔记本 效率进化 四步效率优化 NumPy 底层进行了不错的优化. %timeit 对于任意语句,它会自 ...

  4. Pandas | 01 数据结构

    Pandas的三种数据结构: 系列(Series) 数据帧(DataFrame) 面板(Panel) 这些数据结构,构建在Numpy数组之上,这意味着它们很快 维数和描述 考虑这些数据结构的最好方法是 ...

  5. pandas基础-Python3

    未完 for examples: example 1: # Code based on Python 3.x # _*_ coding: utf-8 _*_ # __Author: "LEM ...

  6. 用scikit-learn和pandas学习Ridge回归

    本文将用一个例子来讲述怎么用scikit-learn和pandas来学习Ridge回归. 1. Ridge回归的损失函数 在我的另外一篇讲线性回归的文章中,对Ridge回归做了一些介绍,以及什么时候适 ...

  7. 用scikit-learn和pandas学习线性回归

    对于想深入了解线性回归的童鞋,这里给出一个完整的例子,详细学完这个例子,对用scikit-learn来运行线性回归,评估模型不会有什么问题了. 1. 获取数据,定义问题 没有数据,当然没法研究机器学习 ...

  8. 【Python实战】Pandas:让你像写SQL一样做数据分析(一)

    1. 引言 Pandas是一个开源的Python数据分析库.Pandas把结构化数据分为了三类: Series,1维序列,可视作为没有column名的.只有一个column的DataFrame: Da ...

  9. pandas 学习(1): pandas 数据结构之Series

    1. Series Series 是一个类数组的数据结构,同时带有标签(label)或者说索引(index). 1.1 下边生成一个最简单的Series对象,因为没有给Series指定索引,所以此时会 ...

随机推荐

  1. python之property、类方法和静态方法

    一.完整的property1.定义一个方法被伪装成属性之后,应该可以执行一个属性的增删改查操作,增加和修改就对应着被setter装饰的方法,删除一个属性对应着被deleter装饰的方法. @prope ...

  2. Flask初识

    一.Flask初识 1.Flask介绍 Flask是一个使用 Python 编写的轻量级 Web 应用框架.其 WSGI 工具箱采用 Werkzeug服务 ,模板引擎则使用 Jinja2 .Flask ...

  3. Educational Codeforces Round 60 (Rated for Div. 2)

    A. Best Subsegment 题意 找 连续区间的平均值  满足最大情况下的最长长度 思路:就是看有几个连续的最大值 #include<bits/stdc++.h> using n ...

  4. [LOJ10121] 与众不同

    题目类型:\(DP\)+\(RMQ\) 传送门:>Here< 题意:给定一个长度为\(N\)的序列,并给出\(M\)次询问.询问区间\([L,R]\)内的最长完美序列.所谓完美序列就是指连 ...

  5. [模板]Min_25筛

    用途 快速($O(\frac{n^{3/4}}{logn})$)地计算一些函数f的前缀和,以及(作为中间结果的)只计算质数的前缀和 一般要求f(p)是积性函数,$f(p)$是多项式的形式,且$f(p^ ...

  6. A.02.00—功能定义与唤醒—起始

    第一章节主要讲的是模块普通的输入输出,精力及能力有限,仅介绍了一些较为普通的信号,另一些信号留待想了解的人自我探索. 第二章节打算介绍的是功能定义和休眠唤醒相关的内容.也是一些基础内容,对于比较少见或 ...

  7. Gym - 101350A Sherlock Bones(思维)

    The great dog detective Sherlock Bones is on the verge of a new discovery. But for this problem, he ...

  8. CMDB服务器管理系统【s5day92】:定制表头

    一.目录结构 二.获取数据,模板语言渲染 web\views.py import json from django.shortcuts import render,HttpResponse from ...

  9. Fiddler--Composer

    Composer选项卡支持手动构建和发请求:也可以在Session列表中拖拽Session放到Composer中,把该Session的请求复制到用户界面: 点击"execute"按 ...

  10. Elasticsearch6.3.2启动过程源码阅读记录

    Elasticsearch6.3.2启动过程源码阅读记录 网上有很多关于es的源码分析,觉得自己技术深度还不够,所以这些文章只是看源码过程中的一个笔记,谈不上分析. 整个启动过程以类名.方法名,按顺序 ...