二、pandas入门
import numpy as np
import pandas as pd
Series:
# Walk-through of pandas Series basics: construction, selection, dict
# round-trips, re-indexing, null checks and naming.
# Commented lines after a statement show the output from the author's run.

# Way 1: build a Series from a list (a default 0..n-1 index is attached)
s1 = pd.Series([1, 2, 3, 4])
s1
# 0 1
# 1 2
# 2 3
# 3 4
# dtype: int64
s1.values  # array([1, 2, 3, 4], dtype=int64)
s1.index  # RangeIndex(start=0, stop=4, step=1)

# Way 2: build a Series from a NumPy array
s2 = pd.Series(np.arange(5, 10))
print(s2)
# 0 5
# 1 6
# 2 7
# 3 8
# 4 9
# dtype: int32

# Way 3: build a Series from a dict (the keys become the index)
s3 = pd.Series({'5': 1, '6': 3, '7': 9})
print(s3)
# 5 1
# 6 3
# 7 9
# dtype: int64
print(s3.index)  # Index(['5', '6', '7'], dtype='object')

# Way 4: build a Series from a list with an explicit index
s4 = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
print(s4)
# A 1
# B 2
# C 3
# D 4
# dtype: int64

# Selecting values: by label, and by boolean mask
print(s4['A'])  # 1
print(s4[s4 > 2])
# C 3
# D 4
# dtype: int64

# Convert the Series to a dict.
# Named s4_as_dict (not `dict`) so the built-in dict type is not shadowed.
s4_as_dict = s4.to_dict()
print(s4_as_dict)  # {'A': 1, 'B': 2, 'C': 3, 'D': 4}

# Convert the dict back to a Series
seri = pd.Series(s4_as_dict)
print(seri)
# A 1
# B 2
# C 3
# D 4
# dtype: int64

# Change the index of a Series: labels that s4 does not have show up as NaN
index_1 = ['z', 'A', 'B', 'v', 'C']
s5 = pd.Series(s4, index=index_1)
print(s5)
# z NaN
# A 1.0
# B 2.0
# v NaN
# C 3.0
# dtype: float64

# Check which entries are null / not null
print(pd.isnull(s5))
# z True
# A False
# B False
# v True
# C False
# dtype: bool
print(pd.notnull(s5))
# z False
# A True
# B True
# v False
# C True
# dtype: bool

# Give the Series a name
s5.name = 'demo'
print(s5)
# z NaN
# A 1.0
# B 2.0
# v NaN
# C 3.0
# Name: demo, dtype: float64

# Give the index a name as well (this statement must run for the output below)
s5.index.name = 'demo index'
print(s5.index)  # Index(['z', 'A', 'B', 'v', 'C'], dtype='object', name='demo index')
DataFrame:
from pandas import Series,DataFrame
import webbrowser
# DataFrame basics, using the TIOBE index table copied from the browser.
link = 'https://www.tiobe.com/tiobe-index/'
webbrowser.open(link)  # open the site in the browser
# Copy the following table from the site so that it is on the clipboard
'''
Jan 2019 Jan 2018 Change Programming Language Ratings Change.1
0 1 1 NaN Java 16.904% +2.69%
1 2 2 NaN C 13.337% +2.30%
2 3 4 change Python 8.294% +3.62%
3 4 3 change C++ 8.158% +2.55%
4 5 7 change Visual Basic .NET 6.459% +3.20%
'''
df = pd.read_clipboard()  # create a DataFrame from the clipboard contents
type(df)  # pandas.core.frame.DataFrame
print(df)  # prints the same table as the text copied above
# Jan 2019 Jan 2018 Change Programming Language Ratings Change.1
# 0 1 1 NaN Java 16.904% +2.69%
# 1 2 2 NaN C 13.337% +2.30%
# 2 3 4 change Python 8.294% +3.62%
# 3 4 3 change C++ 8.158% +2.55%
# 4 5 7 change Visual Basic .NET 6.459% +3.20%

# Get the column names
print(df.columns)  # Index(['Jan 2019', 'Jan 2018', 'Change', 'Programming Language', 'Ratings','Change.1'],dtype='object')

# Get the values of one column
print(df.Ratings)  # the Ratings column, via attribute access
# 0 16.904%
# 1 13.337%
# 2 8.294%
# 3 8.158%
# 4 6.459%
# Name: Ratings, dtype: object

# 'Jan 2019' contains a space, so attribute access cannot be used for it.
# To select two columns use df[['Jan 2019', 'Ratings']]; that yields a DataFrame.
print(df['Jan 2019'])
# 0 1
# 1 2
# 2 3
# 3 4
# 4 5
# Name: Jan 2019, dtype: int64
print(type(df.Ratings), type(df['Jan 2019']))  # <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>

# Build a new DataFrame from some columns of the old one
df_new = DataFrame(df, columns=['Programming Language', 'Jan 2019'])
print(df_new)
# Programming Language Jan 2019
# 0 Java 1
# 1 C 2
# 2 Python 3
# 3 C++ 4
# 4 Visual Basic .NET 5

# Same, but asking for a column the old DataFrame does not have:
# the new column is created and filled with NaN
df_new2 = DataFrame(df, columns=['new lie', 'Jan 2019'])
print(df_new2)
# new lie Jan 2019
# 0 NaN 1
# 1 NaN 2
# 2 NaN 3
# 3 NaN 4
# 4 NaN 5

# The new column can then be given values
df_new2['new lie'] = range(5, 10)
df_new2['new lie'] = np.arange(5, 10)  # a NumPy array works too
df_new2['new lie'] = pd.Series(np.arange(5, 10))  # and so does a Series
print(df_new2)
# new lie Jan 2019
# 0 5 1
# 1 6 2
# 2 7 3
# 3 8 4
# 4 9 5

# Assign a Series that only carries index labels 2 and 3: those two rows get
# the values, the remaining rows of the column become NaN (see output below)
df_new2['new lie'] = pd.Series([200, 200], index=[2, 3])
print(df_new2)
# new lie Jan 2019
# 0 NaN 1
# 1 NaN 2
# 2 200.0 3
# 3 200.0 4
# 4 NaN 5
Series与DataFrame:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
# Relationship between Series and DataFrame, built from one dict of lists.
data = {'country': ['Belgium', 'India', 'Brazil'],
        'Capital': ['Brussels', 'New Delhi', 'Brasillia'],
        'Population': [11190846, 1303171035, 207847528]}

# Series
s1 = pd.Series(data['country'])
# 0 Belgium
# 1 India
# 2 Brazil
# dtype: object
s1.values  # array(['Belgium', 'India', 'Brazil'], dtype=object)
s1.index  # RangeIndex(start=0, stop=3, step=1)

# DataFrame
df1 = pd.DataFrame(data)  # create a DataFrame from a dict
# country Capital Population
# 0 Belgium Brussels 11190846
# 1 India New Delhi 1303171035
# 2 Brazil Brasillia 207847528
df1['country']  # access one column
df1.country  # another way to access a column, same result
# 0 Belgium
# 1 India
# 2 Brazil
# Name: country, dtype: object
type(df1['country'])  # pandas.core.series.Series

# Access the rows of a DataFrame
df1.iterrows()  # <generator object DataFrame.iterrows at 0x0000000004E8F888>
# Each item yielded by iterrows() is a 2-tuple (see output below)
for row in df1.iterrows():
    print(row)
    print('类型:', type(row))
    print('长度:', len(row), '\n')
'''
(0, country Belgium
Capital Brussels
Population 11190846
Name: 0, dtype: object)
类型: <class 'tuple'>
长度: 2 (1, country India
Capital New Delhi
Population 1303171035
Name: 1, dtype: object)
类型: <class 'tuple'>
长度: 2 (2, country Brazil
Capital Brasillia
Population 207847528
Name: 2, dtype: object)
类型: <class 'tuple'>
长度: 2
'''
# Look inside the tuple for the first row only: (index label, row as a Series)
for row in df1.iterrows():
    print('第一个:', row[0])
    print('第二个:', row[1], '\n')
    print('类型:', type(row[0]), type(row[1]))
    break
'''
第一个: 0
第二个: country Belgium
Capital Brussels
Population 11190846
Name: 0, dtype: object 类型: <class 'int'> <class 'pandas.core.series.Series'>
'''

# Create a DataFrame from Series: each Series becomes one row
s1 = pd.Series(data['Capital'])
s2 = pd.Series(data['country'])
s3 = pd.Series(data['Population'])
df_new1 = pd.DataFrame([s1, s2, s3])
print(df_new1)
'''
0 1 2
0 Brussels New Delhi Brasillia
1 Belgium India Brazil
2 11190846 1303171035 207847528
'''
print(df_new1.T)  # transpose, so each Series becomes a column
'''
0 1 2
0 Brussels Belgium 11190846
1 New Delhi India 1303171035
2 Brasillia Brazil 207847528
'''
# Label the rows before transposing so the result has named columns
df_new2 = pd.DataFrame([s1, s2, s3], index=['Capital', 'country', 'Population']).T
print(df_new2)
'''
Capital country Population
0 Brussels Belgium 11190846
1 New Delhi India 1303171035
2 Brasillia Brazil 207847528
'''
pandas中的DataFrame的IO操作:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import webbrowser
# URL of the pandas 0.20 IO documentation page
link='http://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)# open the page in the browser
# Copy the following table from the page so that it is on the clipboard
'''
Format Type Data Description Reader Writer
text CSV read_csv to_csv
text JSON read_json to_json
text HTML read_html to_html
text Local clipboard read_clipboard to_clipboard
binary MS Excel read_excel to_excel
binary HDF5 Format read_hdf to_hdf
binary Feather Format read_feather to_feather
binary Msgpack read_msgpack to_msgpack
binary Stata read_stata to_stata
binary SAS read_sas
binary Python Pickle Format read_pickle to_pickle
SQL SQL read_sql to_sql
SQL Google Big Query read_gbq to_gbq
'''
# Build a DataFrame from the text currently on the clipboard (the table copied above)
df1=pd.read_clipboard()
print(df1)
'''
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
'''
df1.to_clipboard()# copy the contents of df1 to the clipboard
df1.to_csv('df1.csv')# write df1 to df1.csv, including the index
df1.to_csv('df11.csv',index=False)# write df1 to df11.csv, without the index
df2=pd.read_csv('df11.csv')# read the csv file back
print(df2)
'''
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
'''
# Called without a path, to_json() returns the JSON text (printed below),
# so df3 is a string here, not a DataFrame
df3=df1.to_json()# serialize to JSON format
print(df3)
'''
{"Format Type":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},
"Data Description":{"0":"CSV","1":"JSON","2":"HTML","3":"Local clipboard","4":"MS Excel","5":"HDF5 Format","6":"Feather Format","7":"Msgpack","8":"Stata","9":"SAS","10":"Python Pickle Format","11":"SQL","12":"Google Big Query"},
"Reader":{"0":"read_csv","1":"read_json","2":"read_html","3":"read_clipboard","4":"read_excel","5":"read_hdf","6":"read_feather","7":"read_msgpack","8":"read_stata","9":"read_sas","10":"read_pickle","11":"read_sql","12":"read_gbq"},
"Writer":{"0":"to_csv","1":"to_json","2":"to_html","3":"to_clipboard","4":"to_excel","5":"to_hdf","6":"to_feather","7":"to_msgpack","8":"to_stata","9":" ","10":"to_pickle","11":"to_sql","12":"to_gbq"}}
'''
# Parse the JSON text back into a DataFrame.
# df3 holds JSON text, not a file path; newer pandas releases deprecate passing
# literal JSON to read_json, so it is wrapped in a file-like StringIO object.
from io import StringIO
print(pd.read_json(StringIO(df3)))
'''
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
'''
df1.to_json('df1.json')# write a JSON file
print(pd.read_json('df1.json'))# read the JSON file back
'''
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
'''
df1.to_html('df1.html')# write an HTML file
# read the HTML file back; the printed result below is wrapped in [...], i.e. a list
print(pd.read_html('df1.html'))
'''
[ Unnamed: 0 Format Type Data Description Reader Writer
0 0 text CSV read_csv to_csv
1 1 text JSON read_json to_json
2 2 text HTML read_html to_html
3 3 text Local clipboard read_clipboard to_clipboard
4 4 binary MS Excel read_excel to_excel
5 5 binary HDF5 Format read_hdf to_hdf
6 6 binary Feather Format read_feather to_feather
7 7 binary Msgpack read_msgpack to_msgpack
8 8 binary Stata read_stata to_stata
9 9 binary SAS read_sas NaN
10 10 binary Python Pickle Format read_pickle to_pickle
11 11 SQL SQL read_sql to_sql
12 12 SQL Google Big Query read_gbq to_gbq]
'''
df1.to_excel('df1.xlsx')# write an Excel file
print(pd.read_excel('df1.xlsx'))# read the Excel file back
'''
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
'''
Series和DataFrame的indexing
import numpy as np
import pandas as pd
# Load the presidential polls CSV from the author's local Windows path
# (the path must be adjusted to run this elsewhere)
imdb=pd.read_csv(r'C:\Users\Administrator\Desktop\py_work\codes\presidential_polls.csv')
print(imdb.shape)# number of rows and columns; (10236, 27) in the author's run
print(imdb.head())# prints the first five rows by default
'''
cycle branch type matchup forecastdate \
0 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16
1 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16
2 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16
3 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16
4 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16 state startdate enddate pollster grade \
0 U.S. 10/25/2016 10/31/2016 Google Consumer Surveys B
1 U.S. 10/27/2016 10/30/2016 ABC News/Washington Post A+
2 Virginia 10/27/2016 10/30/2016 ABC News/Washington Post A+
3 Florida 10/20/2016 10/24/2016 SurveyUSA A
4 U.S. 10/20/2016 10/25/2016 Pew Research Center B+ ... adjpoll_clinton adjpoll_trump adjpoll_johnson \
0 ... 42.64140 40.86509 5.675099
1 ... 43.29659 44.72984 3.401513
2 ... 46.29779 40.72604 6.401513
3 ... 46.35931 45.30585 1.777730
4 ... 45.32744 42.20888 3.618320 adjpoll_mcmullin multiversions \
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 NaN NaN
4 NaN NaN url poll_id question_id \
0 https://datastudio.google.com/u/0/#/org//repor... 47940 74999
1 http://www.langerresearch.com/wp-content/uploa... 47881 74936
2 https://www.washingtonpost.com/local/virginia-... 47880 74934
3 http://www.baynews9.com/content/news/baynews9/... 47465 74252
4 http://www.people-press.org/2016/10/27/as-elec... 47616 74519 createddate timestamp
0 11/1/16 15:09:38 1 Nov 2016
1 11/1/16 15:09:38 1 Nov 2016
2 11/1/16 15:09:38 1 Nov 2016
3 10/25/16 15:09:38 1 Nov 2016
4 10/27/16 15:09:38 1 Nov 2016 [5 rows x 27 columns]
'''
print(imdb.tail())# prints the last 5 rows by default; used the same way as head
'''
cycle branch type matchup \
10231 2016 President polls-only Clinton vs. Trump vs. Johnson
10232 2016 President polls-only Clinton vs. Trump vs. Johnson
10233 2016 President polls-only Clinton vs. Trump vs. Johnson
10234 2016 President polls-only Clinton vs. Trump vs. Johnson
10235 2016 President polls-only Clinton vs. Trump vs. Johnson forecastdate state startdate enddate \
10231 11/1/16 Alabama 9/30/2016 10/13/2016
10232 11/1/16 Virginia 9/30/2016 10/6/2016
10233 11/1/16 Virginia 9/16/2016 9/22/2016
10234 11/1/16 North Carolina 6/20/2016 6/21/2016
10235 11/1/16 Utah 7/29/2016 8/18/2016 pollster grade ... adjpoll_clinton \
10231 Ipsos A- ... 37.30964
10232 Ipsos A- ... 49.13094
10233 Ipsos A- ... 45.97130
10234 Public Policy Polling B+ ... 45.29390
10235 Ipsos A- ... 31.62721 adjpoll_trump adjpoll_johnson adjpoll_mcmullin multiversions \
10231 54.76821 NaN NaN NaN
10232 39.41588 NaN NaN NaN
10233 39.97518 NaN NaN NaN
10234 46.66175 1.596946 NaN NaN
10235 44.65947 NaN NaN NaN url poll_id \
10231 http://reuters.com/statesofthenation/ 46817
10232 http://www.reuters.com/statesofthenation/ 46675
10233 http://www.reuters.com/statesofthenation/ 46096
10234 http://www.publicpolicypolling.com/pdf/2015/PP... 44400
10235 http://www.reuters.com/statesofthenation 44978 question_id createddate timestamp
10231 73263 10/15/16 14:57:58 1 Nov 2016
10232 72969 10/10/16 14:57:58 1 Nov 2016
10233 72088 9/26/16 14:57:58 1 Nov 2016
10234 67363 6/23/16 14:57:58 1 Nov 2016
10235 69011 8/24/16 14:57:58 1 Nov 2016 [5 rows x 27 columns]
'''
print(imdb.iloc[10:13,0:5])# rows 10-12, columns 0-4 (iloc selects by integer position, like slicing, so the end position is excluded)
'''
cycle branch type matchup forecastdate
10 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16
11 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16
12 2016 President polls-plus Clinton vs. Trump vs. Johnson 11/1/16
'''
# Keep the 3x5 slice, then slice it again by position: the positions are
# relative to df, while the row labels (11, 12) are kept from imdb
df=imdb.iloc[10:13,0:5]
print(df.iloc[1:3,1:3])
'''
branch type
11 President polls-plus
12 President polls-plus
'''
print(df.loc[10:12,:'type'])# loc selects by label; label slices include the end label
'''
cycle branch type
10 2016 President polls-plus
11 2016 President polls-plus
12 2016 President polls-plus
'''
print(imdb['adjpoll_clinton'].head())# look at a single column
'''
0 42.64140
1 43.29659
2 46.29779
3 46.35931
4 45.32744
Name: adjpoll_clinton, dtype: float64
'''
print(imdb['adjpoll_clinton'][10])# look at a single element; 44.53217 in the author's run
print(imdb[['adjpoll_trump','adjpoll_johnson']])# build a new DataFrame from one or more columns
'''
adjpoll_trump adjpoll_johnson
0 40.86509 5.675099
1 44.72984 3.401513
2 40.72604 6.401513
3 45.30585 1.777730
4 42.20888 3.618320
5 42.26663 6.114222
6 43.56017 3.153590
7 43.50333 3.466432
8 37.24948 6.420006
9 41.69540 4.220173
10 43.84845 NaN
11 47.92262 2.676897
12 29.50605 3.170510
13 40.34972 5.823322
14 42.01937 6.499082
15 45.07725 3.499082
16 39.33826 5.044833
17 46.11255 3.054228
18 39.80679 6.359501
19 41.34735 4.421316
20 39.99571 6.272840
21 50.75720 NaN
22 38.87231 8.359501
23 41.55637 4.964521
24 43.84806 5.359501
25 45.03370 2.193952
26 44.78595 4.359501
27 44.18040 5.160502
28 40.41809 3.333669
29 49.47709 4.308866
... ... ...
10206 36.75014 9.152230
10207 40.08237 NaN
10208 43.67710 NaN
10209 43.40106 NaN
10210 35.52956 NaN
10211 35.03328 NaN
10212 44.77681 NaN
10213 38.24798 NaN
10214 41.25978 NaN
10215 41.59738 NaN
10216 41.64499 1.974752
10217 36.15054 NaN
10218 38.65057 NaN
10219 29.49314 9.007062
10220 37.87221 NaN
10221 39.42957 NaN
10222 53.95455 NaN
10223 33.07150 3.328916
10224 41.88533 1.974752
10225 36.82408 9.741756
10226 47.80848 NaN
10227 42.01089 3.671217
10228 45.06726 NaN
10229 40.16534 12.889780
10230 41.56030 2.872088
10231 54.76821 NaN
10232 39.41588 NaN
10233 39.97518 NaN
10234 46.66175 1.596946
10235 44.65947 NaN [10236 rows x 2 columns]
'''
Series和DataFrame的Reindexing
import numpy as np
import pandas as pd
# Reindexing a Series and a DataFrame, then dropping labels.
# Commented lines after a statement show the output from the author's run
# (previously pasted as bare text, which made the script unparseable).
s1 = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
print(s1)
# A 1
# B 2
# C 3
# D 4
# dtype: int64

# Labels that s1 does not have ('E') come back as NaN
print(s1.reindex(['A', 'C', 'E']))
# A 1.0
# C 3.0
# E NaN
# dtype: float64

# fill_value supplies the value used for missing labels
print(s1.reindex(['A', 'C', 'm'], fill_value=11))
# A 1
# C 3
# m 11
# dtype: int64

s2 = pd.Series(['a', 'b', 'c'], index=[1, 5, 10])
print(s2)
# 1 a
# 5 b
# 10 c
# dtype: object
print(s2.reindex(index=range(15)))
# 0 NaN
# 1 a
# 2 NaN
# 3 NaN
# 4 NaN
# 5 b
# 6 NaN
# 7 NaN
# 8 NaN
# 9 NaN
# 10 c
# 11 NaN
# 12 NaN
# 13 NaN
# 14 NaN
# dtype: object

# Forward fill: label 0 stays NaN (nothing precedes it), 1-4 take 'a',
# 5-9 take 'b', 10 and above take 'c'
print(s2.reindex(index=range(15), method='ffill'))
# 0 NaN
# 1 a
# 2 a
# 3 a
# 4 a
# 5 b
# 6 b
# 7 b
# 8 b
# 9 b
# 10 c
# 11 c
# 12 c
# 13 c
# 14 c
# dtype: object

# s2 itself is unchanged by reindex
print(s2)
# 1 a
# 5 b
# 10 c
# dtype: object

# DataFrames filled with random numbers (the values below are from one sample run)
df1 = pd.DataFrame(np.random.rand(25).reshape([5, 5]))
print(df1)
# 0 1 2 3 4
# 0 0.499115 0.244375 0.849224 0.348352 0.472657
# 1 0.676503 0.769790 0.479774 0.468003 0.703029
# 2 0.153982 0.699009 0.379184 0.151905 0.921860
# 3 0.904037 0.196925 0.421180 0.384442 0.642122
# 4 0.641124 0.748790 0.824351 0.101550 0.412564
df2 = pd.DataFrame(np.random.rand(25).reshape([5, 5]),
                   index=['A', 'B', 'D', 'E', 'F'],
                   columns=['c1', 'c2', 'c3', 'c4', 'c5'])
print(df2)
# c1 c2 c3 c4 c5
# A 0.279563 0.267224 0.077868 0.080046 0.528182
# B 0.660053 0.088954 0.512298 0.259552 0.108562
# D 0.734865 0.776419 0.581695 0.578712 0.157753
# E 0.926365 0.729410 0.328161 0.531319 0.550878
# F 0.849754 0.770988 0.537104 0.833631 0.062303

# Reindex the rows: the new label 'C' becomes a row of NaN
print(df2.reindex(index=['A', 'B', 'C', 'D', 'E', 'F']))
# c1 c2 c3 c4 c5
# A 0.279563 0.267224 0.077868 0.080046 0.528182
# B 0.660053 0.088954 0.512298 0.259552 0.108562
# C NaN NaN NaN NaN NaN
# D 0.734865 0.776419 0.581695 0.578712 0.157753
# E 0.926365 0.729410 0.328161 0.531319 0.550878
# F 0.849754 0.770988 0.537104 0.833631 0.062303

# Reindex the columns: the new label 'c6' becomes a column of NaN
print(df2.reindex(columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6']))
# c1 c2 c3 c4 c5 c6
# A 0.279563 0.267224 0.077868 0.080046 0.528182 NaN
# B 0.660053 0.088954 0.512298 0.259552 0.108562 NaN
# D 0.734865 0.776419 0.581695 0.578712 0.157753 NaN
# E 0.926365 0.729410 0.328161 0.531319 0.550878 NaN
# F 0.849754 0.770988 0.537104 0.833631 0.062303 NaN

# Reindex rows and columns in one call
print(df2.reindex(index=['A', 'B', 'C', 'D', 'E', 'F'],
                  columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6']))
# c1 c2 c3 c4 c5 c6
# A 0.279563 0.267224 0.077868 0.080046 0.528182 NaN
# B 0.660053 0.088954 0.512298 0.259552 0.108562 NaN
# C NaN NaN NaN NaN NaN NaN
# D 0.734865 0.776419 0.581695 0.578712 0.157753 NaN
# E 0.926365 0.729410 0.328161 0.531319 0.550878 NaN
# F 0.849754 0.770988 0.537104 0.833631 0.062303 NaN

s1 = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
print(s1)
# A 1
# B 2
# C 3
# D 4
# dtype: int64

# Reindexing with fewer labels keeps only those labels;
# can also be written as s1.reindex(index=['A', 'C'])
print(s1.reindex(['A', 'C']))
# A 1
# C 3
# dtype: int64
print(df2.reindex(index=['A', 'C']))
# c1 c2 c3 c4 c5
# A 0.279563 0.267224 0.077868 0.080046 0.528182
# C NaN NaN NaN NaN NaN

# drop removes the given labels from the result
print(s1.drop(['B', 'C']))
# A 1
# D 4
# dtype: int64
print(s1.drop('A'))
# B 2
# C 3
# D 4
# dtype: int64

# axis=0 drops rows, axis=1 drops columns
print(df2.drop(['A'], axis=0))
# c1 c2 c3 c4 c5
# B 0.660053 0.088954 0.512298 0.259552 0.108562
# D 0.734865 0.776419 0.581695 0.578712 0.157753
# E 0.926365 0.729410 0.328161 0.531319 0.550878
# F 0.849754 0.770988 0.537104 0.833631 0.062303
print(df2.drop(['c1'], axis=1))
# c2 c3 c4 c5
# A 0.267224 0.077868 0.080046 0.528182
# B 0.088954 0.512298 0.259552 0.108562
# D 0.776419 0.581695 0.578712 0.157753
# E 0.729410 0.328161 0.531319 0.550878
# F 0.770988 0.537104 0.833631 0.062303
谈一谈NaN(NaN means Not a Number,即“非数值”)
# Working with NaN in a Series and in a DataFrame.
# Commented lines after a statement show the output from the author's run
# (previously pasted as bare text, which made the script unparseable).
n = np.nan
print(type(n))  # <class 'float'>
print(1 + n)  # nan: arithmetic between a number and nan always gives nan

s1 = pd.Series([1, 2, np.nan, 3, 4], index=['A', 'B', 'C', 'D', 'E'])
print(s1)
# A 1.0
# B 2.0
# C NaN
# D 3.0
# E 4.0
# dtype: float64
print(s1.isnull())
# A False
# B False
# C True
# D False
# E False
# dtype: bool
print(s1.notnull())
# A True
# B True
# C False
# D True
# E True
# dtype: bool
print(s1.dropna())  # drop the entries whose value is NaN
# A 1.0
# B 2.0
# D 3.0
# E 4.0
# dtype: float64

# NaN in DataFrame
dframe = pd.DataFrame([[1, 2, 3],
                       [np.nan, 5, 6],
                       [7, np.nan, 9],
                       [np.nan, np.nan, np.nan]])
print(dframe)
# 0 1 2
# 0 1.0 2.0 3.0
# 1 NaN 5.0 6.0
# 2 7.0 NaN 9.0
# 3 NaN NaN NaN
print(dframe.isnull())
# 0 1 2
# 0 False False False
# 1 True False False
# 2 False True False
# 3 True True True
print(dframe.notnull())
# 0 1 2
# 0 True True True
# 1 False True True
# 2 True False True
# 3 False False False

# Defaults are axis=0 (rows) and how='any'; same as dframe.dropna(axis=0)
print(dframe.dropna())
# 0 1 2
# 0 1.0 2.0 3.0

# how='any': a row is dropped as soon as it contains any NaN
print(dframe.dropna(how='any'))
# 0 1 2
# 0 1.0 2.0 3.0

# how='all': a row is dropped only when every value in it is NaN
print(dframe.dropna(how='all'))
# 0 1 2
# 0 1.0 2.0 3.0
# 1 NaN 5.0 6.0
# 2 7.0 NaN 9.0

# axis=1 drops columns; every column here contains a NaN, so only the index is left
print(dframe.dropna(axis=1))
# Empty DataFrame
# Columns: []
# Index: [0, 1, 2, 3]

dframe2 = pd.DataFrame([[1, 2, 3, np.nan],
                        [2, np.nan, 5, 6],
                        [np.nan, 7, np.nan, 9],
                        [1, np.nan, np.nan, np.nan]])
print(dframe2)
# 0 1 2 3
# 0 1.0 2.0 3.0 NaN
# 1 2.0 NaN 5.0 6.0
# 2 NaN 7.0 NaN 9.0
# 3 1.0 NaN NaN NaN

# No thresh given, so the default how='any' rule applies:
# every row contains at least one NaN and nothing is left
df2 = dframe2.dropna()
print(df2)
# Empty DataFrame
# Columns: [0, 1, 2, 3]
# Index: []

# thresh=2 keeps the rows that have at least 2 non-NaN values
# (row 3 has only one, so it is dropped)
df3 = dframe2.dropna(thresh=2)
print(df3)
# 0 1 2 3
# 0 1.0 2.0 3.0 NaN
# 1 2.0 NaN 5.0 6.0
# 2 NaN 7.0 NaN 9.0

print(dframe2.fillna(value=10))  # fill every NaN with 10
# 0 1 2 3
# 0 1.0 2.0 3.0 10.0
# 1 2.0 10.0 5.0 6.0
# 2 10.0 7.0 10.0 9.0
# 3 1.0 10.0 10.0 10.0

# A dict gives each column its own fill value:
# column 0 is filled with 'A', column 1 with '16', and so on.
# Note: fillna and dropna do not modify the original Series / DataFrame.
print(dframe2.fillna(value={0: 'A', 1: '16', 2: '中国', 3: 'k'}))
# 0 1 2 3
# 0 1 2 3 k
# 1 2 16 5 6
# 2 A 7 中国 9
# 3 1 16 中国 k
多级index
# Multi-level (hierarchical) index on a Series and on a DataFrame.
# Commented lines after a statement show the output from the author's run;
# the Series values are random, so they differ between runs.
s1 = pd.Series(np.random.rand(6),
               index=[['1', '1', '1', '2', '2', '2'],
                      ['a', 'b', 'c', 'a', 'b', 'c']])
print(s1)
# 1 a 0.973831
# b 0.762415
# c 0.135763
# 2 a 0.974687
# b 0.471638
# c 0.573157
# dtype: float64
print(type(s1))  # <class 'pandas.core.series.Series'>

# Selecting on the outer level gives a Series indexed by the inner level
print(s1['1'])
# a 0.973831
# b 0.762415
# c 0.135763
# dtype: float64
print(type(s1['1']))  # <class 'pandas.core.series.Series'>
print(s1['1']['a'])  # 0.9738309965219155

# Selecting on the inner level across all outer labels
print(s1[:, 'a'])
# 1 0.973831
# 2 0.974687
# dtype: float64

# Convert the two-level Series to a DataFrame (two ways)
df1 = s1.unstack()
print(df1)
# a b c
# 1 0.973831 0.762415 0.135763
# 2 0.974687 0.471638 0.573157
df2 = pd.DataFrame([s1['1'], s1['2']])
print(df2)
# a b c
# 0 0.973831 0.762415 0.135763
# 1 0.974687 0.471638 0.573157

# Convert the DataFrame back to a two-level Series
s2 = df1.unstack()
print(s2)
# a 1 0.973831
# 2 0.974687
# b 1 0.762415
# 2 0.471638
# c 1 0.135763
# 2 0.573157
# dtype: float64

# Transposing first gives the same level order as the original s1
print(df1.T.unstack())
# 1 a 0.973831
# b 0.762415
# c 0.135763
# 2 a 0.974687
# b 0.471638
# c 0.573157
# dtype: float64

# A DataFrame with two levels on both the rows and the columns
df = pd.DataFrame(np.arange(16).reshape(4, 4),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['BJ', 'BJ', '上海', '广州'], [111, 222, 111, 222]])
print(df)
# BJ 上海 广州
# 111 222 111 222
# a 1 0 1 2 3
# 2 4 5 6 7
# b 1 8 9 10 11
# 2 12 13 14 15
print(df['BJ'])
# 111 222
# a 1 0 1
# 2 4 5
# b 1 8 9
# 2 12 13
print(type(df['BJ']))  # <class 'pandas.core.frame.DataFrame'>
print(df['BJ', 111])  # df['BJ'][111] gives the same result
# a 1 0
# 2 4
# b 1 8
# 2 12
# Name: (BJ, 111), dtype: int32
Mapping
# Adding a column to a DataFrame: direct Series assignment versus map().
# Commented lines after a statement show the output from the author's run
# (previously pasted as bare text, which made the script unparseable).
df1 = pd.DataFrame({'城市': ['北京', '上海', '广州'], '人口': [1000, 2000, 1500]})
print(df1)
# 城市 人口
# 0 北京 1000
# 1 上海 2000
# 2 广州 1500

# Add a column
# Way 1: assign a Series (it is matched to the rows by index label)
df1['GDP'] = pd.Series([9999, 8888, 7777])
print(df1)
# 城市 人口 GDP
# 0 北京 1000 9999
# 1 上海 2000 8888
# 2 广州 1500 7777

# Way 2 (preferred; the df2 example below shows why): map an existing
# column through a dict, so the result does not depend on the index labels
salary = {'北京': 10, '上海': 20, '广州': 30}
df1['工资'] = df1['城市'].map(salary)
print(df1)
# 城市 人口 GDP 工资
# 0 北京 1000 9999 10
# 1 上海 2000 8888 20
# 2 广州 1500 7777 30

df2 = pd.DataFrame({'城市': ['北京', '上海', '广州'], '人口': [1000, 2000, 1500]},
                   index=['A', 'B', 'C'])
print(df2)
# 城市 人口
# A 北京 1000
# B 上海 2000
# C 广州 1500

# The Series has the default 0..2 index, which matches none of A/B/C,
# so the whole new column is NaN
df2['GDP'] = pd.Series([9999, 8888, 7777])
print(df2)
# 城市 人口 GDP
# A 北京 1000 NaN
# B 上海 2000 NaN
# C 广州 1500 NaN

# Way 1 only works when the Series carries the same index labels as the frame
df2['GGDDPP'] = pd.Series([9999, 8888, 7777], index=['A', 'B', 'C'])
print(df2)
# 城市 人口 GDP GGDDPP
# A 北京 1000 NaN 9999
# B 上海 2000 NaN 8888
# C 广州 1500 NaN 7777
Replace
# Replacing values in a Series.
# Commented lines after a statement show the output from the author's run
# (previously pasted as bare text, which made the script unparseable).
s1 = pd.Series(np.arange(100, 105))
print(s1)
# 0 100
# 1 101
# 2 102
# 3 103
# 4 104
# dtype: int32

# Replace one value, given as (old, new)
print(s1.replace(101, np.nan))
# 0 100.0
# 1 NaN
# 2 102.0
# 3 103.0
# 4 104.0
# dtype: float64

# Same replacement, given as a {old: new} dict
print(s1.replace({101: np.nan}))
# 0 100.0
# 1 NaN
# 2 102.0
# 3 103.0
# 4 104.0
# dtype: float64

# Replace several values at once: the two lists are paired position by position
print(s1.replace([100, 103, 104], ['中', 'eng', 's']))
# 0 中
# 1 101
# 2 102
# 3 eng
# 4 s
# dtype: object

print(s1)  # s1 itself has not changed
# 0 100
# 1 101
# 2 102
# 3 103
# 4 104
# dtype: int32
二、pandas入门的更多相关文章
- 利用Python进行数据分析——pandas入门
利用Python进行数据分析--pandas入门 基于NumPy建立的 from pandas importSeries,DataFrame,import pandas as pd 一.两种数据结构 ...
- 利用python进行数据分析之pandas入门
转自https://zhuanlan.zhihu.com/p/26100976 目录: 5.1 pandas 的数据结构介绍5.1.1 Series5.1.2 DataFrame5.1.3索引对象5. ...
- < 利用Python进行数据分析 - 第2版 > 第五章 pandas入门 读书笔记
<利用Python进行数据分析·第2版>第五章 pandas入门--基础对象.操作.规则 python引用.浅拷贝.深拷贝 / 视图.副本 视图=引用 副本=浅拷贝/深拷贝 浅拷贝/深拷贝 ...
- XML学习总结(二)——XML入门
XML学习总结(二)——XML入门 一.XML语法学习 学习XML语法的目的就是编写XML 一个XML文件分为如下几部分内容: 文档声明 元素 属性 注释 CDATA区 .特殊字符 处理指令(proc ...
- Spring+SpringMVC+MyBatis深入学习及搭建(十二)——SpringMVC入门程序(一)
转载请注明出处:http://www.cnblogs.com/Joanna-Yan/p/6999743.html 前面讲到:Spring+SpringMVC+MyBatis深入学习及搭建(十一)——S ...
- 基于tensorflow的MNIST手写数字识别(二)--入门篇
http://www.jianshu.com/p/4195577585e6 基于tensorflow的MNIST手写字识别(一)--白话卷积神经网络模型 基于tensorflow的MNIST手写数字识 ...
- 转:JAVAWEB开发之权限管理(二)——shiro入门详解以及使用方法、shiro认证与shiro授权
原文地址:JAVAWEB开发之权限管理(二)——shiro入门详解以及使用方法.shiro认证与shiro授权 以下是部分内容,具体见原文. shiro介绍 什么是shiro shiro是Apache ...
- Python 数据处理库 pandas 入门教程
Python 数据处理库 pandas 入门教程2018/04/17 · 工具与框架 · Pandas, Python 原文出处: 强波的技术博客 pandas是一个Python语言的软件包,在我们使 ...
- 深入浅出 JMS(二) - ActiveMQ 入门指南
深入浅出 JMS(二) - ActiveMQ 入门指南 上篇博文深入浅出 JMS(一) – JMS 基本概念,我们介绍了消息通信的规范JMS,这篇博文介绍一款开源的 JMS 具体实现-- Active ...
- 利用python进行数据分析--pandas入门2
随书练习,第五章 pandas入门2 # coding: utf-8 # In[1]: from pandas import Series,DataFrame import pandas as pd ...
随机推荐
- QDUOJ LC的课后辅导 单调递增栈
LC的课后辅导 发布时间: 2015年9月19日 21:42 时间限制: 1000ms 内存限制: 256M 描述 有一天,LC给我们出了一道题,如图: 这个图形从左到右由若干个 宽为1 高不 ...
- 【转】Win8 下 管理无线网络
Ref:http://windows.microsoft.com/zh-CN/windows-8/manage-wireless-network-profiles 管理无线网络配置文件 适用于 Win ...
- ASP.NET中的几种弹出框提示基本方法
NET程序的开发过程中,常常需要和用户进行信息交互,对话框的出现将解决了这些问题,下面是本人对常用对话框使用的小结,希望对大家有所帮助 我们在.NET程序的开发过程中,常常需要和用户进行信息交互,比如 ...
- E20180519-hm
distinct adj. 明显的,清楚的; 卓越的,不寻常的; 有区别的; 确切的;
- 零基础配置Linux服务器环境
详细步骤请走官方通道 over!over!over!
- vc编程中出现 fatal error C1010: 在查找预编译头时遇到意外的文件结尾。是否忘记了向源中添加“#include "stdafx.h"”?
解决办法菜单--〉项目--〉设置,出现“项目设置”对话框,左边展开项目,在“源文件”中找到出错的文件,然后在右边选择“C/C++”属性 页,在Category下拉框中选择“Precompiled He ...
- 删除node_modul模块
npm安装rimraf ,npm版本号要是低于5.x.x 具体不记得了,不然就安装不了这个工具 npm install rimraf -g 然后: rimraf node_modules 在这里学到的 ...
- Matplotlib 在绘画bar时, 鼠标响应点击 bar 的消息
官方教程: http://urania.udea.edu.co/sitios/astronomia-2.0/pages/descargas.rs/files/descargasdt5vi/Cursos ...
- SpringMVC注解校验
spring注解式参数校验 版权声明:本文为博主原创文章,未经博主允许不得转载. https://blog.csdn.net/jinzhencs/article/details/5168283 ...
- Cordova/Cordova.h file not found的解决方法
参考http://stackoverflow.com/questions/10714600/cdvplugin-h-file-not-found-in-cordova-as-component-cle ...