pandas-Notes1

#coding = utf-8

import pandas as pd

import numpy as np

import  matplotlib as plt

# A Series is a one-dimensional labelled array (a single vertical column).
# Including np.nan makes pandas store every value as float64.
s = pd.Series([1, 2, np.nan, 3])

# print(x) with a single argument behaves the same on Python 2 and Python 3,
# whereas the old `print x` statement is a SyntaxError on Python 3.
print(s)
# Sample output:
# 0    1.0
# 1    2.0
# 2    NaN
# 3    3.0
# dtype: float64

##################################################
# Creating DataFrames (pd.DataFrame plays the role of data.frame in R).
#
# print(x) with a single argument behaves the same on Python 2 and Python 3,
# whereas the old `print x` statement is a SyntaxError on Python 3.
##################################################

# Six consecutive dates starting at 2017-06-01. freq='D' (one calendar day per
# step) is the default and is spelled out here so the comment matches the code.
dates = pd.date_range('20170601', periods=6, freq='D')
print(dates)
# Sample output:
# DatetimeIndex(['2017-06-01', '2017-06-02', '2017-06-03', '2017-06-04',
#                '2017-06-05', '2017-06-06'],
#               dtype='datetime64[ns]', freq='D')

# Create a DataFrame from a 2-D array.
# np.random.randn(6, 4) returns a 6x4 array of floats sampled from the
# standard normal distribution (mean 0, variance 1), so the values differ on
# every run. `index` supplies the row labels, `columns` the column labels.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
# Sample output (random values; yours will differ):
#                    A         B         C         D
# 2017-06-01 -0.463965  0.960470 -0.186808 -1.198540
# 2017-06-02  0.267148 -0.599783  2.143011  1.211458
# 2017-06-03 -1.516629  1.228905  0.949323  0.127440
# 2017-06-04 -0.509237  0.387529  0.108155 -0.478422
# 2017-06-05  0.600630  0.776675  1.906076 -0.382445
# 2017-06-06  0.566325  1.189855  0.206210  2.334218

# Create a DataFrame from a dict of objects: each key becomes a column and
# each column may have its own dtype.
df2 = pd.DataFrame({
    # scalar float, repeated for every row -> float64
    'A': 1.,
    # scalar pandas timestamp, repeated for every row -> datetime64
    'B': pd.Timestamp('20170601'),
    # a Series: its index (0..3) provides the row labels of the frame
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    # a NumPy array with an explicit integer dtype
    'D': np.array([1] * 4, dtype='int32'),
    # categorical (factor-like) column
    'E': pd.Categorical(["test", "train", "test", "train"]),
    # NOTE(review): the original note said a bare 'foo' scalar raised an
    # error here; pandas documents scalar broadcasting for dict input, so
    # that may have been version-specific -- confirm. An explicit object
    # Series is kept.
    'F': pd.Series(['foo'] * 4, dtype='object'),
})

print(df2.dtypes)
# Sample output (exact dtype names can vary with the pandas version):
# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object

print(df2)
# Sample output:
#      A          B    C  D      E    F
# 0  1.0 2017-06-01  1.0  1   test  foo
# 1  1.0 2017-06-01  1.0  1  train  foo
# 2  1.0 2017-06-01  1.0  1   test  foo
# 3  1.0 2017-06-01  1.0  1  train  foo

# Viewing data. Uses `df` and `df2` created earlier in this script.
# print(x) with a single argument behaves the same on Python 2 and Python 3,
# whereas the old `print x` statement is a SyntaxError on Python 3.

# First n rows / last n rows (the column names are printed as a header).
print(df2.head(2))
print(df2.tail(3))

# Row labels and column labels.
print(df2.index)
print(df2.columns)
# Sample output (recorded under Python 2; the exact repr depends on the
# Python and pandas versions):
# Int64Index([0, 1, 2, 3], dtype='int64')
# Index([u'A', u'B', u'C', u'D', u'E', u'F'], dtype='object')

# The underlying data as a NumPy array, without the index and column labels.
print(df2.values)

# Summary statistics for each column.
print(df.describe())
# Sample output (random values; yours will differ):
#               A         B         C         D
# count  6.000000  6.000000  6.000000  6.000000
# mean  -0.175955  0.657275  0.854328  0.268951
# std    0.817537  0.688410  0.983534  1.289192
# min   -1.516629 -0.599783 -0.186808 -1.198540
# 25%   -0.497919  0.484815  0.132669 -0.454428
# 50%   -0.098408  0.868573  0.577766 -0.127502
# 75%    0.491531  1.132509  1.666888  0.940453
# max    0.600630  1.228905  2.143011  2.334218

# Transpose: rows become columns and vice versa. df.T returns a new object;
# df itself is unchanged, as the second print shows.
print(df.T)
print(df)
# Sample output of print(df) (random values; yours will differ):
#                    A         B         C         D
# 2017-06-01 -0.463965  0.960470 -0.186808 -1.198540
# 2017-06-02  0.267148 -0.599783  2.143011  1.211458
# 2017-06-03 -1.516629  1.228905  0.949323  0.127440
# 2017-06-04 -0.509237  0.387529  0.108155 -0.478422
# 2017-06-05  0.600630  0.776675  1.906076 -0.382445
# 2017-06-06  0.566325  1.189855  0.206210  2.334218

# Sort by labels: axis=0 sorts by the row index, axis=1 sorts by the column
# names. ascending=False gives descending order.
print(df.sort_index(axis=0, ascending=False))
# Sample output:
#                    A         B         C         D
# 2017-06-06  0.566325  1.189855  0.206210  2.334218
# 2017-06-05  0.600630  0.776675  1.906076 -0.382445
# 2017-06-04 -0.509237  0.387529  0.108155 -0.478422
# 2017-06-03 -1.516629  1.228905  0.949323  0.127440
# 2017-06-02  0.267148 -0.599783  2.143011  1.211458
# 2017-06-01 -0.463965  0.960470 -0.186808 -1.198540

# Sort rows by the values in column B (ascending by default).
print(df.sort_values(by='B'))
# Sample output:
#                    A         B         C         D
# 2017-06-02  0.267148 -0.599783  2.143011  1.211458
# 2017-06-04 -0.509237  0.387529  0.108155 -0.478422
# 2017-06-05  0.600630  0.776675  1.906076 -0.382445
# 2017-06-01 -0.463965  0.960470 -0.186808 -1.198540
# 2017-06-06  0.566325  1.189855  0.206210  2.334218
# 2017-06-03 -1.516629  1.228905  0.949323  0.127440

##################################################
# Extracting data from a DataFrame.
# Uses `df` and `dates` created earlier in this script.
#
# print(x) with a single argument behaves the same on Python 2 and Python 3,
# whereas the old `print x` statement is a SyntaxError on Python 3.
##################################################

# --- Simple [] access ---
# Slicing with [] selects rows, either by integer position or by index label.
# A label slice includes both end points.
print(df[0:3])
print(df['20170601':'20170603'])

# A single column name selects that column and returns a Series.
print(df['A'])

# --- Selection by label: .loc / .at ---
# The row labelled with the first date.
print(df.loc[dates[0]])

# All rows, only columns A and B.
print(df.loc[:, ['A', 'B']])

# .at gives fast access to a single scalar by row label and column label.
print(df.at[dates[0], 'A'])

# --- Selection by position: .iloc / .iat ---
# The first row.
print(df.iloc[0])

# Rows at positions 3-4 and columns at positions 0-1 (end is exclusive).
print(df.iloc[3:5, 0:2])

# .iat is the positional counterpart of .at: fast scalar access that takes
# integer positions only; slices (:) are not allowed.
print(df.iat[1, 1])

# --- Boolean indexing ---
print(df)
# Sample output (random values; yours will differ):
#                    A         B         C         D
# 2017-06-01 -0.463965  0.960470 -0.186808 -1.198540
# 2017-06-02  0.267148 -0.599783  2.143011  1.211458
# 2017-06-03 -1.516629  1.228905  0.949323  0.127440
# 2017-06-04 -0.509237  0.387529  0.108155 -0.478422
# 2017-06-05  0.600630  0.776675  1.906076 -0.382445
# 2017-06-06  0.566325  1.189855  0.206210  2.334218

# Keep only the rows where column A is positive.
print(df[df.A > 0])
# Sample output:
#                    A         B         C         D
# 2017-06-02  0.267148 -0.599783  2.143011  1.211458
# 2017-06-05  0.600630  0.776675  1.906076 -0.382445
# 2017-06-06  0.566325  1.189855  0.206210  2.334218

# Element-wise mask: positive values are kept, everything else becomes NaN.
print(df[df > 0])
# Sample output:
#                    A         B         C         D
# 2017-06-01       NaN  0.960470       NaN       NaN
# 2017-06-02  0.267148       NaN  2.143011  1.211458
# 2017-06-03       NaN  1.228905  0.949323  0.127440
# 2017-06-04       NaN  0.387529  0.108155       NaN
# 2017-06-05  0.600630  0.776675  1.906076       NaN
# 2017-06-06  0.566325  1.189855  0.206210  2.334218

# Work on a copy so that adding columns does not modify df.
df3 = df.copy()
df3['E'] = ['one', 'one', 'two', 'three', 'four', 'five']
print(df3)
# Sample output:
#                    A         B         C         D      E
# 2017-06-01 -0.463965  0.960470 -0.186808 -1.198540    one
# 2017-06-02  0.267148 -0.599783  2.143011  1.211458    one
# 2017-06-03 -1.516629  1.228905  0.949323  0.127440    two
# 2017-06-04 -0.509237  0.387529  0.108155 -0.478422  three
# 2017-06-05  0.600630  0.776675  1.906076 -0.382445   four
# 2017-06-06  0.566325  1.189855  0.206210  2.334218   five

# Keep the rows whose E value is 'two' or 'five'.
print(df3[df3['E'].isin(['two', 'five'])])
# Sample output:
#                    A         B         C         D     E
# 2017-06-03 -1.516629  1.228905  0.949323  0.127440   two
# 2017-06-06  0.566325  1.189855  0.206210  2.334218  five

# Add another column through .loc; a Series could be assigned instead of an
# array.
df3.loc[:, 'F'] = np.array(['hello'] * len(df3))
print(df3)
# Sample output (recorded in a different run, so the random numbers do not
# match the tables above):
#                    A         B         C         D      E      F
# 2017-06-01 -0.246362 -1.968794  0.596064  1.656667    one  hello
# 2017-06-02  0.212728  0.931468 -0.977221 -1.709449    one  hello
# 2017-06-03 -0.129513  1.911554  0.998007  0.867370    two  hello
# 2017-06-04  0.688660  0.010904 -0.391857  1.546751  three  hello
# 2017-06-05  0.283462  0.082037 -1.050666  1.092778   four  hello
# 2017-06-06 -1.084382  0.560529 -1.497804 -0.709840   five  hello

##################################################
# Missing data (NaN).
# Uses `df` and `dates` created earlier in this script.
#
# print(x) with a single argument behaves the same on Python 2 and Python 3,
# whereas the old `print x` statement is a SyntaxError on Python 3.
##################################################

# reindex returns a copy with the requested row and column labels, so it can
# change, add or drop labels. Here: keep the first four dates and add a new
# column E.
df4 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])

# Cells that were never assigned are NaN; set only the first row of E.
df4.loc[dates[0], 'E'] = 1
print(df4)
# Sample output (recorded in a different run, so the random numbers do not
# match the earlier tables):
#                    A         B         C         D    E
# 2017-06-01  0.142853  0.380009 -1.268463  0.463704  1.0
# 2017-06-02  0.831730  1.615873  0.657926  1.323841  NaN
# 2017-06-03 -0.739303  0.524235  0.877496  1.065300  NaN
# 2017-06-04  0.785783 -0.655868  0.631207  1.365685  NaN

# Test for missing values: returns a DataFrame of the same shape holding
# True where the value is NaN and False elsewhere.
print(pd.isnull(df4))

# Drop every row that contains at least one NaN.
print(df4.dropna(how='any'))
# Sample output (again from a different run):
#                    A         B         C         D    E
# 2017-06-01  0.071516  0.377737  1.203327  0.711661  1.0

# Replace every NaN with a fixed value.
print(df4.fillna(value=5))

pandas-Notes1的更多相关文章

pandas基础-Python3
未完 for examples: example 1: # Code based on Python 3.x # _*_ coding: utf-8 _*_ # __Author: "LEM ...
10 Minutes to pandas
摘要一.创建对象二.查看数据三.选择和设置四.缺失值处理五.相关操作六.聚合七.重排(Reshaping) 八.时间序列九.Categorical类型十.画图十一 ...
利用Python进行数据分析(15) pandas基础: 字符串操作
字符串对象方法 split()方法拆分字符串: strip()方法去掉空白符和换行符: split()结合strip()使用: "+"符号可以将多个字符串连接起来: join( ...
利用Python进行数据分析(10) pandas基础: 处理缺失数据
数据不完整在数据分析的过程中很常见. pandas使用浮点值NaN表示浮点和非浮点数组里的缺失数据. pandas使用isnull()和notnull()函数来判断缺失情况. 对于缺失数据一般处理 ...
利用Python进行数据分析(12) pandas基础: 数据合并
pandas 提供了三种主要方法可以对数据进行合并: pandas.merge()方法:数据库风格的合并: pandas.concat()方法:轴向连接,即沿着一条轴将多个对象堆叠到一起: 实例方法c ...
利用Python进行数据分析(9) pandas基础: 汇总统计和计算
pandas 对象拥有一些常用的数学和统计方法. 例如,sum() 方法,进行列小计: sum() 方法传入 axis=1 指定为横向汇总,即行小计: idxmax() 获取最大值对应的索 ...
利用Python进行数据分析(8) pandas基础: Series和DataFrame的基本操作
一.reindex() 方法:重新索引针对 Series 重新索引指的是根据index参数重新进行排序. 如果传入的索引值在数据里不存在,则不会报错,而是添加缺失值的新行. 不想用缺失值,可以用 ...
利用Python进行数据分析(7) pandas基础: Series和DataFrame的简单介绍
一.pandas 是什么 pandas 是基于 NumPy 的一个 Python 数据分析包,主要目的是为了数据分析.它提供了大量高级的数据结构和对数据处理的方法. pandas 有两个主要的数据结构 ...
pandas.DataFrame对行和列求和及添加新行和列
导入模块: from pandas import DataFrame import pandas as pd import numpy as np 生成DataFrame数据 df = DataFra ...
pandas.DataFrame排除特定行
使用Python进行数据分析时,经常要使用到的一个数据结构就是pandas的DataFrame 如果我们想要像Excel的筛选那样,只要其中的一行或某几行,可以使用isin()方法,将需要的行的值以列 ...

随机推荐

Java中
JavaSE---jar文件
1.当一个应用程序开发完成后,大致有3种方式发布: 1.1 使用平台相关的编译器将整个应用编译成平台相关的可执行文件: 1.2 为整个应用编辑一个批处理文件: 1.3 将应用程序制作为一个可执行的ja ...
scrapy-redis 分布式哔哩哔哩网站用户爬虫
scrapy里面,对每次请求的url都有一个指纹,这个指纹就是判断url是否被请求过的.默认是开启指纹即一个URL请求一次.如果我们使用分布式在多台机上面爬取数据,为了让爬虫的数据不重复,我们也需要一 ...
Ubuntu常用指令集
Ubuntu Linux 操作系统常用命令详细介绍 (1) sudo apt-get install 软件名 安装软件命令 sudo nautilus 打开文件(有 root 权限)su root 切换 ...
MVC中验证码的简单使用
首先新建一个MVC项目添加类:验证码帮助类(ValidateCodeHelper) using System; using System.Collections.Generic; using Sys ...
Access denied for user ''@'localhost' (using password: NO)之idea坑~
idea启动sql连接远程数据库时发生错误: 发现是sql连接配置问题: spring: datasource: data-username: root data-password: 123456 u ...
js浮点数乘除法
JS在处理浮点数计算时经常会遇到精度的问题,上一篇博客封装了JS浮点数加减法的方法,这一次来封装一下js浮点数乘除法运算. 其实浮点除法的封装跟加减法的封装原理是一样,只是在第一次计算完后会再复位小数 ...
使用AuthToken架构保护用户帐号验证Cookie的安全性
在项目或者网站开发中,我们很多人很多时候喜欢使用微软的FormsAuthentication类的GetAuthCookie函数生成需要在访客客户端放置的帐号校验Cookie,这个本身没问题,但是很多人 ...
Bootstrap下拉菜单相关
1.实现普通下拉菜单:.dropdown>button.dropdown-toggle[data-toggle="dropdown"]+ul.dropdown-menu; 2 ...
Python+selenium之截图图片并保存截取的图片
本文转载:http://blog.csdn.net/u011541946/article/details/70141488 http://www.cnblogs.com/timsheng/archiv ...

pandas-Notes1

pandas-Notes1的更多相关文章

随机推荐

热门专题