Data Preparation in Pandas

Data cleaning

import pandas as pd

import numpy as np

string_data=pd.Series(['aardvark','artichoke',np.nan,'avocado']);string_data

0     aardvark

1    artichoke

2          NaN

3      avocado

dtype: object

string_data.isnull()

0    False

1    False

2     True

3    False

dtype: bool

string_data[2]

nan

from numpy import nan as NA

data=pd.Series([1,NA,3.5,NA,7])

data.dropna()

0    1.0

2    3.5

4    7.0

dtype: float64

data[[False,True,True,False,False]]

1    NaN

2    3.5

dtype: float64

data[data.notnull()]

0    1.0

2    3.5

4    7.0

dtype: float64

data=pd.DataFrame([[1,6.5,3],[1,NA,NA],[NA,NA,NA],[NA,6.5,3]]);data

	0	1	2
0	1.0	6.5	3.0
1	1.0	NaN	NaN
2	NaN	NaN	NaN
3	NaN	6.5	3.0

data.dropna()

	0	1	2
0	1.0	6.5	3.0

data.dropna(how='all')

	0	1	2
0	1.0	6.5	3.0
1	1.0	NaN	NaN
3	NaN	6.5	3.0

data[4]=NA;data

	0	1	2	4
0	1.0	6.5	3.0	NaN
1	1.0	NaN	NaN	NaN
2	NaN	NaN	NaN	NaN
3	NaN	6.5	3.0	NaN

data.dropna(how='all',axis='columns')

	0	1	2
0	1.0	6.5	3.0
1	1.0	NaN	NaN
2	NaN	NaN	NaN
3	NaN	6.5	3.0

df=pd.DataFrame(np.random.randn(7,3))

df

	0	1	2
0	-1.744196	-0.281787	-0.963212
1	-1.114174	0.024707	0.095524
2	0.879205	-1.272202	-0.317218
3	0.227725	-0.067809	0.609824
4	-1.082470	-1.230476	-1.616135
5	-1.218976	0.018245	-0.155761
6	-0.607157	-0.641986	-0.406378

help(np.random.randn)

Help on built-in function randn:

randn(...) method of mtrand.RandomState instance

    randn(d0, d1, ..., dn)

    Return a sample (or samples) from the "standard normal" distribution.

    If positive, int_like or int-convertible arguments are provided,

    `randn` generates an array of shape ``(d0, d1, ..., dn)``, filled

    with random floats sampled from a univariate "normal" (Gaussian)

    distribution of mean 0 and variance 1 (if any of the :math:`d_i` are

    floats, they are first converted to integers by truncation). A single

    float randomly sampled from the distribution is returned if no

    argument is provided.

    This is a convenience function.  If you want an interface that takes a

    tuple as the first argument, use `numpy.random.standard_normal` instead.

    Parameters

    ----------

    d0, d1, ..., dn : int, optional

        The dimensions of the returned array, should be all positive.

        If no argument is given a single Python float is returned.

    Returns

    -------

    Z : ndarray or float

        A ``(d0, d1, ..., dn)``-shaped array of floating-point samples from

        the standard normal distribution, or a single such float if

        no parameters were supplied.

    See Also

    --------

    random.standard_normal : Similar, but takes a tuple as its argument.

    Notes

    -----

    For random samples from :math:`N(\mu, \sigma^2)`, use:

    ``sigma * np.random.randn(...) + mu``

    Examples

    --------

    >>> np.random.randn()

    2.1923875335537315 #random

    Two-by-four array of samples from N(3, 6.25):

    >>> 2.5 * np.random.randn(2, 4) + 3

    array([[-4.49401501,  4.00950034, -1.81814867,  7.29718677],  #random

           [ 0.39924804,  4.68456316,  4.99394529,  4.84057254]]) #random

df

	0	1	2
0	-1.744196	-0.281787	-0.963212
1	-1.114174	0.024707	0.095524
2	0.879205	-1.272202	-0.317218
3	0.227725	-0.067809	0.609824
4	-1.082470	-1.230476	-1.616135
5	-1.218976	0.018245	-0.155761
6	-0.607157	-0.641986	-0.406378

df.iloc[:4,1]=NA;df

	0	1	2
0	-1.744196	NaN	-0.963212
1	-1.114174	NaN	0.095524
2	0.879205	NaN	-0.317218
3	0.227725	NaN	0.609824
4	-1.082470	-1.230476	-1.616135
5	-1.218976	0.018245	-0.155761
6	-0.607157	-0.641986	-0.406378

df.iloc[:2,2]=NA;df

	0	1	2
0	-1.744196	NaN	NaN
1	-1.114174	NaN	NaN
2	0.879205	NaN	-0.317218
3	0.227725	NaN	0.609824
4	-1.082470	-1.230476	-1.616135
5	-1.218976	0.018245	-0.155761
6	-0.607157	-0.641986	-0.406378

df.dropna()

	0	1	2
4	-1.082470	-1.230476	-1.616135
5	-1.218976	0.018245	-0.155761
6	-0.607157	-0.641986	-0.406378

df.dropna(thresh=2)

	0	1	2
2	0.879205	NaN	-0.317218
3	0.227725	NaN	0.609824
4	-1.082470	-1.230476	-1.616135
5	-1.218976	0.018245	-0.155761
6	-0.607157	-0.641986	-0.406378

df.fillna(0)

	0	1	2
0	-1.744196	0.000000	0.000000
1	-1.114174	0.000000	0.000000
2	0.879205	0.000000	-0.317218
3	0.227725	0.000000	0.609824
4	-1.082470	-1.230476	-1.616135
5	-1.218976	0.018245	-0.155761
6	-0.607157	-0.641986	-0.406378

df.fillna({1:0.5,2:0})

	0	1	2
0	-1.744196	0.500000	0.000000
1	-1.114174	0.500000	0.000000
2	0.879205	0.500000	-0.317218
3	0.227725	0.500000	0.609824
4	-1.082470	-1.230476	-1.616135
5	-1.218976	0.018245	-0.155761
6	-0.607157	-0.641986	-0.406378

df

	0	1	2
0	-1.744196	NaN	NaN
1	-1.114174	NaN	NaN
2	0.879205	NaN	-0.317218
3	0.227725	NaN	0.609824
4	-1.082470	-1.230476	-1.616135
5	-1.218976	0.018245	-0.155761
6	-0.607157	-0.641986	-0.406378

df.fillna(0,inplace=True)

df

	0	1	2
0	-1.744196	0.000000	0.000000
1	-1.114174	0.000000	0.000000
2	0.879205	0.000000	-0.317218
3	0.227725	0.000000	0.609824
4	-1.082470	-1.230476	-1.616135
5	-1.218976	0.018245	-0.155761
6	-0.607157	-0.641986	-0.406378

df=pd.DataFrame(np.random.randn(6,3))

df.iloc[2:,1]=NA

df.iloc[4:,2]=NA

df

	0	1	2
0	-0.970921	-1.311345	0.779965
1	-0.352837	0.290834	-0.440396
2	0.574406	NaN	2.034865
3	0.088611	NaN	-0.004141
4	0.792289	NaN	NaN
5	0.668345	NaN	NaN

# Forward-fill: propagate the last valid value down each column to replace NaN.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated in recent pandas releases.
df.ffill()

	0	1	2
0	-0.970921	-1.311345	0.779965
1	-0.352837	0.290834	-0.440396
2	0.574406	0.290834	2.034865
3	0.088611	0.290834	-0.004141
4	0.792289	0.290834	-0.004141
5	0.668345	0.290834	-0.004141

df

	0	1	2
0	-0.970921	-1.311345	0.779965
1	-0.352837	0.290834	-0.440396
2	0.574406	NaN	2.034865
3	0.088611	NaN	-0.004141
4	0.792289	NaN	NaN
5	0.668345	NaN	NaN

df.dropna()

	0	1	2
0	-0.970921	-1.311345	0.779965
1	-0.352837	0.290834	-0.440396

df.dropna(thresh=2)

	0	1	2
0	-0.970921	-1.311345	0.779965
1	-0.352837	0.290834	-0.440396
2	0.574406	NaN	2.034865
3	0.088611	NaN	-0.004141

df.dropna(thresh=2,inplace=True)

df

	0	1	2
0	-0.970921	-1.311345	0.779965
1	-0.352837	0.290834	-0.440396
2	0.574406	NaN	2.034865
3	0.088611	NaN	-0.004141

data=pd.DataFrame({'K1':['one','two']*3+['two'],'K2':[1,1,2,3,3,4,4]});data

	K1	K2
0	one	1
1	two	1
2	one	2
3	two	3
4	one	3
5	two	4
6	two	4

data.duplicated()

0    False

1    False

2    False

3    False

4    False

5    False

6     True

dtype: bool

data.drop_duplicates()

	K1	K2
0	one	1
1	two	1
2	one	2
3	two	3
4	one	3
5	two	4

data['v1']=range(7)

data

	K1	K2	v1
0	one	1	0
1	two	1	1
2	one	2	2
3	two	3	3
4	one	3	4
5	two	4	5
6	two	4	6

data.drop_duplicates(['K1','K2'])

	K1	K2	v1
0	one	1	0
1	two	1	1
2	one	2	2
3	two	3	3
4	one	3	4
5	two	4	5

df

	0	1	2
0	-0.970921	-1.311345	0.779965
1	-0.352837	0.290834	-0.440396
2	0.574406	NaN	2.034865
3	0.088611	NaN	-0.004141

data

	K1	K2	v1
0	one	1	0
1	two	1	1
2	one	2	2
3	two	3	3
4	one	3	4
5	two	4	5
6	two	4	6

data.drop_duplicates(['K1','K2'])

	K1	K2	v1
0	one	1	0
1	two	1	1
2	one	2	2
3	two	3	3
4	one	3	4
5	two	4	5

Transforming Data Using a Function or Mapping

import pandas as pd

import numpy as np

data=pd.DataFrame({'food':['bacon','pulled pork','bacon','pastrami','corned beef','Bacon','Pastrami','honey ham','nova lox'],

                  'ounces':[4,3,12,6,7.5,8,3,5,6]});data

	food	ounces
0	bacon	4.0
1	pulled pork	3.0
2	bacon	12.0
3	pastrami	6.0
4	corned beef	7.5
5	Bacon	8.0
6	Pastrami	3.0
7	honey ham	5.0
8	nova lox	6.0

meat_to_animal={'bacon':'pig',

               'pulled pork':'pig',

               'pastrami':'cow',

               'corned beef':'cow',

               'honey ham':'pig',

               'nova lox':'salmon'}

pd.Series.str.lower

<function pandas.core.strings._noarg_wrapper.<locals>.wrapper>

`str.lower` above is a vectorized string method that a Series exposes through its `.str` accessor.

lowercased=data['food'].str.lower()

data['animal']=lowercased

data

	food	ounces	animal
0	bacon	4.0	bacon
1	pulled pork	3.0	pulled pork
2	bacon	12.0	bacon
3	pastrami	6.0	pastrami
4	corned beef	7.5	corned beef
5	Bacon	8.0	bacon
6	Pastrami	3.0	pastrami
7	honey ham	5.0	honey ham
8	nova lox	6.0	nova lox

The map() method on a Series accepts a function or a dict-like object containing a mapping. Using map() is a convenient way to perform element-wise transformations and other data-cleaning operations.

data['animal']=lowercased.map(meat_to_animal);data

	food	ounces	animal
0	bacon	4.0	pig
1	pulled pork	3.0	pig
2	bacon	12.0	pig
3	pastrami	6.0	cow
4	corned beef	7.5	cow
5	Bacon	8.0	pig
6	Pastrami	3.0	cow
7	honey ham	5.0	pig
8	nova lox	6.0	salmon

We could also have passed a function that does all the work, such as the following:

data['food'].map(lambda x:meat_to_animal[x.lower()])

0       pig

1       pig

2       pig

3       cow

4       cow

5       pig

6       cow

7       pig

8    salmon

Name: food, dtype: object

Replacing values

data=pd.Series([1,-999,2,-999,-1000,3]);data

0       1

1    -999

2       2

3    -999

4   -1000

5       3

dtype: int64

data.replace(-999,np.nan) # Replace one value with one value

0       1.0

1       NaN

2       2.0

3       NaN

4   -1000.0

5       3.0

dtype: float64

data.replace([-999,-1000],np.nan) # Replace multi-values with one value

0    1.0

1    NaN

2    2.0

3    NaN

4    NaN

5    3.0

dtype: float64

data.replace([-999,-1000],[np.nan,0])# Replace multi-values with multi-values

0    1.0

1    NaN

2    2.0

3    NaN

4    0.0

5    3.0

dtype: float64

data.replace({-999: np.nan, -1000: 0})  # a dict of {old: new} pairs can also be passed to replace

0    1.0

1    NaN

2    2.0

3    NaN

4    0.0

5    3.0

dtype: float64

data1=pd.Series(['A','B','c',12])

help(data1.str.replace)

Help on method replace in module pandas.core.strings:

replace(pat, repl, n=-1, case=True, flags=0) method of pandas.core.strings.StringMethods instance

    Replace occurrences of pattern/regex in the Series/Index with

    some other string. Equivalent to :meth:`str.replace` or

    :func:`re.sub`.

    Parameters

    ----------

    pat : string

        Character sequence or regular expression

    repl : string

        Replacement sequence

    n : int, default -1 (all)

        Number of replacements to make from start

    case : boolean, default True

        If True, case sensitive

    flags : int, default 0 (no flags)

        re module flags, e.g. re.IGNORECASE

    Returns

    -------

    replaced : Series/Index of objects

Renaming Axis indexes

data=pd.DataFrame(np.arange(12).reshape((3,4)),index=['Ohio','Colorado','New York'],columns=['One','Two','three','Four']);data

	One	Two	three	Four
Ohio	0	1	2	3
Colorado	4	5	6	7
New York	8	9	10	11

data.index.map(lambda x:x[:4].upper())

array(['OHIO', 'COLO', 'NEW '], dtype=object)

data

	One	Two	three	Four
Ohio	0	1	2	3
Colorado	4	5	6	7
New York	8	9	10	11

data.index=data.index.map(lambda x:x[:4].upper());data # Modify DataFrame in-place

	One	Two	three	Four
OHIO	0	1	2	3
COLO	4	5	6	7
NEW	8	9	10	11

If you want to create a transformed version of a dataset without modifying the original, a useful method is rename().

data

	One	Two	three	Four
OHIO	0	1	2	3
COLO	4	5	6	7
NEW	8	9	10	11

data.rename(index=str.title,columns=str.upper)

	ONE	TWO	THREE	FOUR
Ohio	0	1	2	3
Colo	4	5	6	7
New	8	9	10	11

data

	One	Two	three	Four
OHIO	0	1	2	3
COLO	4	5	6	7
NEW	8	9	10	11

To modify the dataset in place, pass inplace=True.

data.rename(index={'OHIO':'INDIANA'},inplace=True)

data

	One	Two	three	Four
INDIANA	0	1	2	3
COLO	4	5	6	7
NEW	8	9	10	11

Discretization and Binning

help(pd.cut)

Help on function cut in module pandas.tools.tile:

cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False)

    Return indices of half-open bins to which each value of `x` belongs.

    Parameters

    ----------

    x : array-like

        Input array to be binned. It has to be 1-dimensional.

    bins : int or sequence of scalars

        If `bins` is an int, it defines the number of equal-width bins in the

        range of `x`. However, in this case, the range of `x` is extended

        by .1% on each side to include the min or max values of `x`. If

        `bins` is a sequence it defines the bin edges allowing for

        non-uniform bin width. No extension of the range of `x` is done in

        this case.

    right : bool, optional

        Indicates whether the bins include the rightmost edge or not. If

        right == True (the default), then the bins [1,2,3,4] indicate

        (1,2], (2,3], (3,4].

    labels : array or boolean, default None

        Used as labels for the resulting bins. Must be of the same length as

        the resulting bins. If False, return only integer indicators of the

        bins.

    retbins : bool, optional

        Whether to return the bins or not. Can be useful if bins is given

        as a scalar.

    precision : int

        The precision at which to store and display the bins labels

    include_lowest : bool

        Whether the first interval should be left-inclusive or not.

    Returns

    -------

    out : Categorical or Series or array of integers if labels is False

        The return type (Categorical or Series) depends on the input: a Series

        of type category if input is a Series else Categorical. Bins are

        represented as categories when categorical data is returned.

    bins : ndarray of floats

        Returned only if `retbins` is True.

    Notes

    -----

    The `cut` function can be useful for going from a continuous variable to

    a categorical variable. For example, `cut` could convert ages to groups

    of age ranges.

    Any NA values will be NA in the result.  Out of bounds values will be NA in

    the resulting Categorical object

    Examples

    --------

    >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True)

    ([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533],

      (6.533, 9.7], (0.191, 3.367]]

    Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]],

    array([ 0.1905    ,  3.36666667,  6.53333333,  9.7       ]))

    >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3,

               labels=["good","medium","bad"])

    [good, good, good, medium, bad, good]

    Categories (3, object): [good < medium < bad]

    >>> pd.cut(np.ones(5), 4, labels=False)

    array([1, 1, 1, 1, 1], dtype=int64)

ages=[20,22,25,27,21,23,37,31,61,45,41,32]

bins=[18,25,35,60,100]

cats=pd.cut(ages,bins)

cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]

Length: 12

Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

len(ages)

type(cats)

pandas.core.categorical.Categorical

  cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

cats.categories

Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype='object')

type(pd.value_counts(cats))

pandas.core.series.Series

help(pd.value_counts)

Help on function value_counts in module pandas.core.algorithms:

value_counts(values, sort=True, ascending=False, normalize=False, bins=None, dropna=True)

    Compute a histogram of the counts of non-null values.

    Parameters

    ----------

    values : ndarray (1-d)

    sort : boolean, default True

        Sort by values

    ascending : boolean, default False

        Sort in ascending order

    normalize: boolean, default False

        If True then compute a relative histogram

    bins : integer, optional

        Rather than count values, group them into half-open bins,

        convenience for pd.cut, only works with numeric data

    dropna : boolean, default True

        Don't include counts of NaN

    Returns

    -------

    value_counts : Series

pd.value_counts([1,1,2,3,4,45,5])

1     2

5     1

45    1

4     1

3     1

2     1

dtype: int64

pd.value_counts(cats)

(18, 25]     5

(35, 60]     3

(25, 35]     3

(60, 100]    1

dtype: int64

You can also supply your own bin names by passing a list or array to the labels option.

group_names=['Youth','YoungAdult','MiddleAged','Senior']

pd.cut(ages,bins,labels=group_names) # labels gives each bin a name. NOTE: bin() is a built-in function, not a reserved keyword; `bins` here is our list of bin edges.

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]

Length: 12

Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

help(bin)

Help on built-in function bin in module builtins:

bin(number, /)

    Return the binary representation of an integer.

    >>> bin(2796202)

    '0b1010101010101010101010'

bin(2)

'0b10'

bins can also be an integer; in that case the range of the data is split into that many equal-width bins.

data=np.random.rand(20)

pd.cut(data,4,precision=2)# precision limits the decimal precision to two digits.

[(0.25, 0.5], (0.25, 0.5], (0.25, 0.5], (0.75, 1], (0.5, 0.75], ..., (0.0024, 0.25], (0.25, 0.5], (0.25, 0.5], (0.25, 0.5], (0.0024, 0.25]]

Length: 20

Categories (4, object): [(0.0024, 0.25] < (0.25, 0.5] < (0.5, 0.75] < (0.75, 1]]

A closely related function, qcut, bins the data based on sample quantiles. Using cut will not usually result in each bin having the same number of data points, whereas qcut, because it uses sample quantiles, yields roughly equal-sized bins.

data=np.random.randn(1000)

cats=pd.qcut(data,4);cats

[(0.0211, 0.689], (0.689, 3.225], (-0.62, 0.0211], (0.689, 3.225], (0.689, 3.225], ..., (0.689, 3.225], [-3.401, -0.62], (-0.62, 0.0211], (-0.62, 0.0211], (-0.62, 0.0211]]

Length: 1000

Categories (4, object): [[-3.401, -0.62] < (-0.62, 0.0211] < (0.0211, 0.689] < (0.689, 3.225]]

pd.value_counts(cats)

(0.689, 3.225]     250

(0.0211, 0.689]    250

(-0.62, 0.0211]    250

[-3.401, -0.62]    250

dtype: int64

cats1=pd.qcut(data,[0,0.1,0.5,0.9,1])

pd.value_counts(cats1)

(0.0211, 1.33]      400

(-1.201, 0.0211]    400

(1.33, 3.225]       100

[-3.401, -1.201]    100

dtype: int64

Detecting and filtering Outliers

data=pd.DataFrame(np.random.randn(1000,4))

data.describe()

	0	1	2	3
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	0.002634	-0.038263	0.001432	-0.040628
std	0.981600	0.996856	1.021248	1.030675
min	-3.400618	-3.427137	-4.309211	-4.375632
25%	-0.656369	-0.713371	-0.681777	-0.754702
50%	-0.005199	-0.026878	-0.019116	0.005450
75%	0.649159	0.613807	0.690614	0.625859
max	3.408137	3.171119	3.784272	2.992607

col=data[2]

col[np.abs(col)>3]

322    3.059163

431   -3.089013

648   -4.309211

653    3.784272

834    3.007481

Name: 2, dtype: float64

help(pd.DataFrame.any)

Help on function any in module pandas.core.frame:

any(self, axis=None, bool_only=None, skipna=None, level=None, **kwargs)

    Return whether any element is True over requested axis

    Parameters

    ----------

    axis : {index (0), columns (1)}

    skipna : boolean, default True

        Exclude NA/null values. If an entire row/column is NA, the result

        will be NA

    level : int or level name, default None

        If the axis is a MultiIndex (hierarchical), count along a

        particular level, collapsing into a Series

    bool_only : boolean, default None

        Include only boolean columns. If None, will attempt to use everything,

        then use only boolean data. Not implemented for Series.

    Returns

    -------

    any : Series or DataFrame (if level specified)

(abs(data)>3) ==(np.abs(data)>3)

	0	1	2	3
0	True	True	True	True
1	True	True	True	True
2	True	True	True	True
3	True	True	True	True
4	True	True	True	True
5	True	True	True	True
6	True	True	True	True
7	True	True	True	True
8	True	True	True	True
9	True	True	True	True
10	True	True	True	True
11	True	True	True	True
12	True	True	True	True
13	True	True	True	True
14	True	True	True	True
15	True	True	True	True
16	True	True	True	True
17	True	True	True	True
18	True	True	True	True
19	True	True	True	True
20	True	True	True	True
21	True	True	True	True
22	True	True	True	True
23	True	True	True	True
24	True	True	True	True
25	True	True	True	True
26	True	True	True	True
27	True	True	True	True
28	True	True	True	True
29	True	True	True	True
...	...	...	...	...
970	True	True	True	True
971	True	True	True	True
972	True	True	True	True
973	True	True	True	True
974	True	True	True	True
975	True	True	True	True
976	True	True	True	True
977	True	True	True	True
978	True	True	True	True
979	True	True	True	True
980	True	True	True	True
981	True	True	True	True
982	True	True	True	True
983	True	True	True	True
984	True	True	True	True
985	True	True	True	True
986	True	True	True	True
987	True	True	True	True
988	True	True	True	True
989	True	True	True	True
990	True	True	True	True
991	True	True	True	True
992	True	True	True	True
993	True	True	True	True
994	True	True	True	True
995	True	True	True	True
996	True	True	True	True
997	True	True	True	True
998	True	True	True	True
999	True	True	True	True

1000 rows × 4 columns

# Select every row that contains at least one value whose magnitude exceeds 3.
# axis is passed by keyword because newer pandas releases do not accept it
# positionally in DataFrame.any(); axis=1 reduces across the columns of each row.
data[(np.abs(data) > 3).any(axis=1)]

	0	1	2	3
59	-3.400618	0.342563	0.649758	-2.629268
274	1.264869	-3.427137	0.991494	-0.906788
322	2.714233	-1.239436	3.059163	0.318054
431	-0.376058	-0.713530	-3.089013	-0.791221
460	0.411801	-0.323974	0.301139	-3.051362
465	0.054043	-1.046532	2.054820	-4.375632
587	0.857067	-3.162763	0.137409	-1.327873
648	-0.323629	0.325867	-4.309211	-0.477572
653	0.171840	0.148702	3.784272	0.269508
678	0.303109	3.171119	0.854269	0.489537
834	1.651314	1.303992	3.007481	0.494971
841	3.408137	0.869413	-0.111245	1.306775
960	-0.302520	-3.118445	2.116509	0.003669

np.sign([0,0.3,-0.3,20,-90])

array([ 0.,  1., -1.,  1., -1.])

data[np.abs(data)>3]=np.sign(data)*3

np.sign(data)*3

	0	1	2	3
0	-3.0	-3.0	-3.0	3.0
1	-3.0	3.0	3.0	-3.0
2	3.0	3.0	3.0	3.0
3	-3.0	-3.0	-3.0	-3.0
4	3.0	-3.0	3.0	-3.0
5	-3.0	-3.0	-3.0	3.0
6	-3.0	-3.0	3.0	3.0
7	3.0	3.0	3.0	3.0
8	-3.0	3.0	-3.0	3.0
9	3.0	-3.0	3.0	3.0
10	-3.0	3.0	-3.0	-3.0
11	-3.0	3.0	3.0	3.0
12	3.0	3.0	3.0	3.0
13	3.0	-3.0	3.0	3.0
14	3.0	3.0	3.0	3.0
15	3.0	3.0	3.0	-3.0
16	3.0	-3.0	3.0	3.0
17	3.0	-3.0	-3.0	3.0
18	-3.0	3.0	3.0	3.0
19	3.0	3.0	3.0	-3.0
20	-3.0	3.0	3.0	3.0
21	3.0	3.0	-3.0	3.0
22	-3.0	3.0	-3.0	-3.0
23	3.0	3.0	-3.0	-3.0
24	3.0	-3.0	3.0	3.0
25	-3.0	-3.0	-3.0	3.0
26	3.0	3.0	-3.0	-3.0
27	3.0	-3.0	-3.0	-3.0
28	3.0	-3.0	-3.0	3.0
29	3.0	3.0	-3.0	-3.0
...	...	...	...	...
970	-3.0	-3.0	3.0	-3.0
971	-3.0	3.0	-3.0	-3.0
972	-3.0	3.0	-3.0	3.0
973	3.0	3.0	3.0	3.0
974	3.0	-3.0	-3.0	3.0
975	-3.0	3.0	-3.0	3.0
976	-3.0	3.0	3.0	3.0
977	-3.0	-3.0	3.0	-3.0
978	3.0	-3.0	-3.0	-3.0
979	-3.0	3.0	-3.0	3.0
980	-3.0	-3.0	-3.0	3.0
981	3.0	3.0	3.0	-3.0
982	-3.0	3.0	-3.0	-3.0
983	-3.0	3.0	-3.0	-3.0
984	3.0	3.0	-3.0	-3.0
985	3.0	3.0	-3.0	3.0
986	-3.0	-3.0	-3.0	3.0
987	-3.0	3.0	-3.0	-3.0
988	3.0	3.0	-3.0	-3.0
989	3.0	-3.0	-3.0	3.0
990	3.0	-3.0	3.0	-3.0
991	3.0	-3.0	3.0	3.0
992	-3.0	3.0	-3.0	-3.0
993	-3.0	3.0	-3.0	3.0
994	3.0	-3.0	-3.0	-3.0
995	3.0	-3.0	-3.0	-3.0
996	3.0	-3.0	3.0	-3.0
997	-3.0	-3.0	-3.0	-3.0
998	3.0	3.0	-3.0	-3.0
999	3.0	3.0	3.0	-3.0

1000 rows × 4 columns

data

	0	1	2	3
0	-0.564062	-0.887969	-0.854782	0.107613
1	-1.364165	1.337851	1.671698	-0.814129
2	0.765877	1.916774	0.441002	2.128419
3	-0.581957	-1.024641	-1.983024	-2.757392
4	0.778034	-1.375845	0.044277	-1.037062
5	-0.796683	-0.540663	-0.120198	0.003503
6	-0.708554	-0.105414	1.037527	0.826310
7	1.233856	1.217529	1.097430	0.842746
8	-0.201433	0.249823	-1.620147	0.436595
9	1.328493	-0.396323	1.927629	1.615656
10	-0.560207	0.252996	-0.151543	-0.667813
11	-1.729057	1.144087	1.087689	0.520086
12	0.704758	1.707940	0.720834	0.447245
13	1.024834	-0.217376	1.340304	0.176801
14	0.075745	1.430761	0.193627	0.191701
15	0.536566	0.047559	1.715175	-1.115074
16	2.803965	-0.465377	1.127140	1.417856
17	0.677525	-1.091631	-0.572231	0.241533
18	-1.172228	1.049830	0.266288	0.836902
19	0.930699	0.379891	1.637741	-1.770379
20	-0.749769	0.711326	1.591292	1.099071
21	1.550585	1.276488	-0.214484	0.195340
22	-0.289236	1.882439	-0.275263	-0.247316
23	0.688167	0.357913	-1.675828	-0.305840
24	1.255532	-1.802804	0.889900	0.864982
25	-1.391447	-0.291022	-0.190022	0.540653
26	0.435101	2.444416	-1.235937	-0.428450
27	0.165456	-1.091942	-1.560662	-0.739435
28	1.469728	-0.123806	-2.071746	2.574603
29	1.287949	1.278130	-0.825906	-1.852465
...	...	...	...	...
970	-0.379102	-0.778606	2.213794	-0.062573
971	-1.108557	0.723650	-2.436704	-0.068733
972	-0.518995	0.455508	-0.217321	1.363977
973	0.444636	1.625221	0.222103	1.236397
974	0.699354	-2.076747	-0.454499	0.383902
975	-1.759718	0.717117	-0.077413	1.698893
976	-1.230778	0.222673	0.151731	0.174875
977	-0.575290	-0.316810	0.380077	-0.048428
978	1.906133	-0.861802	-0.026937	-2.865641
979	-0.134489	0.607949	-0.821089	0.831827
980	-0.058894	-0.707492	-0.273980	0.129724
981	2.288519	0.149683	0.580679	-0.055218
982	-0.280748	0.861358	-0.254339	-0.596723
983	-1.322965	0.323534	-0.585862	-1.316894
984	0.793711	0.165646	-0.212855	-1.752453
985	0.310908	0.758156	-0.040923	0.538293
986	-0.589173	-1.688947	-0.501485	0.019880
987	-0.111807	1.007026	-0.853133	-0.249211
988	0.601993	0.690953	-1.168277	-0.516737
989	1.319895	-0.046141	-0.680194	1.443361
990	1.839785	-0.480675	0.056481	-0.097993
991	2.590916	-0.367057	1.110105	0.130826
992	-0.108846	1.717209	-0.580895	-0.985869
993	-1.152810	0.390732	-0.104866	1.553947
994	1.721177	-0.088994	-0.565308	-1.602808
995	0.922409	-0.027923	-1.258001	-1.933848
996	0.647699	-0.089378	1.455509	-0.598519
997	-1.590236	-0.544202	-0.764923	-0.329425
998	0.969542	0.106538	-0.188919	-1.474017
999	0.235337	0.232514	0.113181	-1.403455

1000 rows × 4 columns

np.sign(data).head(10) # return the first 10 rows.

	0	1	2	3
0	-1.0	-1.0	-1.0	1.0
1	-1.0	1.0	1.0	-1.0
2	1.0	1.0	1.0	1.0
3	-1.0	-1.0	-1.0	-1.0
4	1.0	-1.0	1.0	-1.0
5	-1.0	-1.0	-1.0	1.0
6	-1.0	-1.0	1.0	1.0
7	1.0	1.0	1.0	1.0
8	-1.0	1.0	-1.0	1.0
9	1.0	-1.0	1.0	1.0

Permutation and random sample

df=pd.DataFrame(np.arange(20).reshape((5,4)))

sampler=np.random.permutation(5);sampler

array([4, 3, 1, 2, 0])

df.take(sampler)

	0	1	2	3
4	16	17	18	19
3	12	13	14	15
1	4	5	6	7
2	8	9	10	11
0	0	1	2	3

df.sample(n=4)

	0	1	2	3
2	8	9	10	11
0	0	1	2	3
1	4	5	6	7
4	16	17	18	19

df.sample(n=10,replace=True) # replace allows repeat choices.

	0	1	2	3
1	4	5	6	7
2	8	9	10	11
1	4	5	6	7
2	8	9	10	11
0	0	1	2	3
3	12	13	14	15
3	12	13	14	15
2	8	9	10	11
0	0	1	2	3
4	16	17	18	19

choices=pd.Series([5,7,-1,6,4])

choices.sample(n=10,replace=True)

2   -1

4    4

1    7

0    5

0    5

3    6

4    4

2   -1

1    7

1    7

dtype: int64

Computing indicator/Dummy variables

df=pd.DataFrame({'Key':['b','b','a','c','a','b'],'data1':range(6)});df

	Key	data1
0	b	0
1	b	1
2	a	2
3	c	3
4	a	4
5	b	5

pd.get_dummies(df['Key'])

	a	b	c
0	0	1	0
1	0	1	0
2	1	0	0
3	0	0	1
4	1	0	0
5	0	1	0

pd.get_dummies(df['Key'],prefix='key')

	key_a	key_b	key_c
0	0	1	0
1	0	1	0
2	1	0	0
3	0	0	1
4	1	0	0
5	0	1	0

df[['data1']]

	data1
0	0
1	1
2	2
3	3
4	4
5	5

df['data1']

0    0

1    1

2    2

3    3

4    4

5    5

Name: data1, dtype: int32

So the difference between df[['data1']] and df['data1'] is apparent: the former returns a DataFrame, while the latter returns a Series.