Data wrangling:Join,Combine,and Reshape,in Pandas

import pandas as pd

import numpy as np

Hierarchical indexing

data=pd.Series(np.random.randn(9),index=[['a','a','a','b','b','c','c','d','d'],[1,2,3,1,3,1,2,2,3]]);data

a  1   -0.396969

   2   -0.348014

   3   -1.340860

b  1   -0.502245

   3    0.640700

c  1    0.063639

   2    1.290096

d  2   -0.003899

   3    0.541342

dtype: float64

data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],

           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])

In labels above,[0,0,0,1,1,2,2,3,3] represents the outer layer's label,0 represents 'a',1 represents 'b',2 represents 'c', 3 represents 'd'.

[0,1,2,0,2,0,1,1,2] represents the inner layer's label,0 represents 1 in data's inner index, 1-->2,2-->3.

data['b']

1   -0.502245

3    0.640700

dtype: float64

data['b':'c']

b  1   -0.502245

   3    0.640700

c  1    0.063639

   2    1.290096

dtype: float64

data.loc[['b','d']] # data['b','d'] is wrong,because in this case,'b' shall be index,'d' shall be columns.

b  1   -0.502245

   3    0.640700

d  2   -0.003899

   3    0.541342

dtype: float64

data.loc['b','d']

---------------------------------------------------------------------------

IndexingError                             Traceback (most recent call last)

<ipython-input-8-f6a5fae3fedc> in <module>()

----> 1 data.loc['b','d']

D:\Anaconda\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)

   1470             except (KeyError, IndexError):

   1471                 pass

-> 1472             return self._getitem_tuple(key)

   1473         else:

   1474             # we by definition only have the 0th axis

D:\Anaconda\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)

    873

    874         # no multi-index, so validate all of the indexers

--> 875         self._has_valid_tuple(tup)

    876

    877         # ugly hack for GH #836

D:\Anaconda\lib\site-packages\pandas\core\indexing.py in _has_valid_tuple(self, key)

    218         for i, k in enumerate(key):

    219             if i >= self.obj.ndim:

--> 220                 raise IndexingError('Too many indexers')

    221             try:

    222                 self._validate_key(k, i)

IndexingError: Too many indexers

data.loc[:,2]

a   -0.348014

c    1.290096

d   -0.003899

dtype: float64

data.unstack()

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	1	2	3
a	-0.396969	-0.348014	-1.340860
b	-0.502245	NaN	0.640700
c	0.063639	1.290096	NaN
d	NaN	-0.003899	0.541342

data.unstack().stack()

a  1   -0.396969

   2   -0.348014

   3   -1.340860

b  1   -0.502245

   3    0.640700

c  1    0.063639

   2    1.290096

d  2   -0.003899

   3    0.541342

dtype: float64

With a DataFrame,either axis can have a hierarchical index.

frame=pd.DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]],columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])

frame

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead tr th { text-align: left }

		Ohio		Colorado
		Green	Red	Green
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

frame.index.names=['Key1','Key2']

frame.columns.names=['state','color']

frame

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead tr th { text-align: left }
.dataframe thead tr:last-of-type th { text-align: right }

	state	Ohio		Colorado
	color	Green	Red	Green
Key1	Key2
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

frame['Ohio']

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	color	Green	Red
Key1	Key2
a	1	0	1
a	2	3	4
b	1	6	7
b	2	9	10

frame['Ohio']['Red']

Key1  Key2

a     1        1

      2        4

b     1        7

      2       10

Name: Red, dtype: int32

frame[('a',1):('b',1)]

	state	Ohio		Colorado
	color	Green	Red	Green
Key1	Key2
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8

frame[('a',1):('b',1)]['Ohio']

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	color	Green	Red
Key1	Key2
a	1	0	1
a	2	3	4
b	1	6	7

frame[('a',1):('b',1)]['Ohio']['Red']

Key1  Key2

a     1       1

      2       4

b     1       7

Name: Red, dtype: int32

Notice the level[] after another [] until to the specified column.

help(frame.loc)

Help on _LocIndexer in module pandas.core.indexing object:

class _LocIndexer(_LocationIndexer)

 |  Access a group of rows and columns by label(s) or a boolean array.

 |

 |  ``.loc[]`` is primarily label based, but may also be used with a

 |  boolean array.

 |

 |  Allowed inputs are:

 |

 |  - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is

 |    interpreted as a *label* of the index, and **never** as an

 |    integer position along the index).

 |  - A list or array of labels, e.g. ``['a', 'b', 'c']``.

 |  - A slice object with labels, e.g. ``'a':'f'``.

 |

 |    .. warning:: Note that contrary to usual python slices, **both** the

 |        start and the stop are included

 |

 |  - A boolean array of the same length as the axis being sliced,

 |    e.g. ``[True, False, True]``.

 |  - A ``callable`` function with one argument (the calling Series, DataFrame

 |    or Panel) and that returns valid output for indexing (one of the above)

 |

 |  See more at :ref:`Selection by Label <indexing.label>`

 |

 |  See Also

 |  --------

 |  DataFrame.at : Access a single value for a row/column label pair

 |  DataFrame.iloc : Access group of rows and columns by integer position(s)

 |  DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the

 |      Series/DataFrame.

 |  Series.loc : Access group of values using labels

 |

 |  Examples

 |  --------

 |  **Getting values**

 |

 |  >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],

 |  ...      index=['cobra', 'viper', 'sidewinder'],

 |  ...      columns=['max_speed', 'shield'])

 |  >>> df

 |              max_speed  shield

 |  cobra               1       2

 |  viper               4       5

 |  sidewinder          7       8

 |

 |  Single label. Note this returns the row as a Series.

 |

 |  >>> df.loc['viper']

 |  max_speed    4

 |  shield       5

 |  Name: viper, dtype: int64

 |

 |  List of labels. Note using ``[[]]`` returns a DataFrame.

 |

 |  >>> df.loc[['viper', 'sidewinder']]

 |              max_speed  shield

 |  viper               4       5

 |  sidewinder          7       8

 |

 |  Single label for row and column

 |

 |  >>> df.loc['cobra', 'shield']

 |  2

 |

 |  Slice with labels for row and single label for column. As mentioned

 |  above, note that both the start and stop of the slice are included.

 |

 |  >>> df.loc['cobra':'viper', 'max_speed']

 |  cobra    1

 |  viper    4

 |  Name: max_speed, dtype: int64

 |

 |  Boolean list with the same length as the row axis

 |

 |  >>> df.loc[[False, False, True]]

 |              max_speed  shield

 |  sidewinder          7       8

 |

 |  Conditional that returns a boolean Series

 |

 |  >>> df.loc[df['shield'] > 6]

 |              max_speed  shield

 |  sidewinder          7       8

 |

 |  Conditional that returns a boolean Series with column labels specified

 |

 |  >>> df.loc[df['shield'] > 6, ['max_speed']]

 |              max_speed

 |  sidewinder          7

 |

 |  Callable that returns a boolean Series

 |

 |  >>> df.loc[lambda df: df['shield'] == 8]

 |              max_speed  shield

 |  sidewinder          7       8

 |

 |  **Setting values**

 |

 |  Set value for all items matching the list of labels

 |

 |  >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50

 |  >>> df

 |              max_speed  shield

 |  cobra               1       2

 |  viper               4      50

 |  sidewinder          7      50

 |

 |  Set value for an entire row

 |

 |  >>> df.loc['cobra'] = 10

 |  >>> df

 |              max_speed  shield

 |  cobra              10      10

 |  viper               4      50

 |  sidewinder          7      50

 |

 |  Set value for an entire column

 |

 |  >>> df.loc[:, 'max_speed'] = 30

 |  >>> df

 |              max_speed  shield

 |  cobra              30      10

 |  viper              30      50

 |  sidewinder         30      50

 |

 |  Set value for rows matching callable condition

 |

 |  >>> df.loc[df['shield'] > 35] = 0

 |  >>> df

 |              max_speed  shield

 |  cobra              30      10

 |  viper               0       0

 |  sidewinder          0       0

 |

 |  **Getting values on a DataFrame with an index that has integer labels**

 |

 |  Another example using integers for the index

 |

 |  >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],

 |  ...      index=[7, 8, 9], columns=['max_speed', 'shield'])

 |  >>> df

 |     max_speed  shield

 |  7          1       2

 |  8          4       5

 |  9          7       8

 |

 |  Slice with integer labels for rows. As mentioned above, note that both

 |  the start and stop of the slice are included.

 |

 |  >>> df.loc[7:9]

 |     max_speed  shield

 |  7          1       2

 |  8          4       5

 |  9          7       8

 |

 |  **Getting values with a MultiIndex**

 |

 |  A number of examples using a DataFrame with a MultiIndex

 |

 |  >>> tuples = [

 |  ...    ('cobra', 'mark i'), ('cobra', 'mark ii'),

 |  ...    ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'),

 |  ...    ('viper', 'mark ii'), ('viper', 'mark iii')

 |  ... ]

 |  >>> index = pd.MultiIndex.from_tuples(tuples)

 |  >>> values = [[12, 2], [0, 4], [10, 20],

 |  ...         [1, 4], [7, 1], [16, 36]]

 |  >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index)

 |  >>> df

 |                       max_speed  shield

 |  cobra      mark i           12       2

 |             mark ii           0       4

 |  sidewinder mark i           10      20

 |             mark ii           1       4

 |  viper      mark ii           7       1

 |             mark iii         16      36

 |

 |  Single label. Note this returns a DataFrame with a single index.

 |

 |  >>> df.loc['cobra']

 |           max_speed  shield

 |  mark i          12       2

 |  mark ii          0       4

 |

 |  Single index tuple. Note this returns a Series.

 |

 |  >>> df.loc[('cobra', 'mark ii')]

 |  max_speed    0

 |  shield       4

 |  Name: (cobra, mark ii), dtype: int64

 |

 |  Single label for row and column. Similar to passing in a tuple, this

 |  returns a Series.

 |

 |  >>> df.loc['cobra', 'mark i']

 |  max_speed    12

 |  shield        2

 |  Name: (cobra, mark i), dtype: int64

 |

 |  Single tuple. Note using ``[[]]`` returns a DataFrame.

 |

 |  >>> df.loc[[('cobra', 'mark ii')]]

 |                 max_speed  shield

 |  cobra mark ii          0       4

 |

 |  Single tuple for the index with a single label for the column

 |

 |  >>> df.loc[('cobra', 'mark i'), 'shield']

 |  2

 |

 |  Slice from index tuple to single label

 |

 |  >>> df.loc[('cobra', 'mark i'):'viper']

 |                       max_speed  shield

 |  cobra      mark i           12       2

 |             mark ii           0       4

 |  sidewinder mark i           10      20

 |             mark ii           1       4

 |  viper      mark ii           7       1

 |             mark iii         16      36

 |

 |  Slice from index tuple to index tuple

 |

 |  >>> df.loc[('cobra', 'mark i'):('viper', 'mark ii')]

 |                      max_speed  shield

 |  cobra      mark i          12       2

 |             mark ii          0       4

 |  sidewinder mark i          10      20

 |             mark ii          1       4

 |  viper      mark ii          7       1

 |

 |  Raises

 |  ------

 |  KeyError:

 |      when any items are not found

 |

 |  Method resolution order:

 |      _LocIndexer

 |      _LocationIndexer

 |      _NDFrameIndexer

 |      pandas._libs.indexing._NDFrameIndexerBase

 |      builtins.object

 |

 |  Methods inherited from _LocationIndexer:

 |

 |  __getitem__(self, key)

 |

 |  ----------------------------------------------------------------------

 |  Methods inherited from _NDFrameIndexer:

 |

 |  __call__(self, axis=None)

 |      Call self as a function.

 |

 |  __iter__(self)

 |

 |  __setitem__(self, key, value)

 |

 |  ----------------------------------------------------------------------

 |  Data descriptors inherited from _NDFrameIndexer:

 |

 |  __dict__

 |      dictionary for instance variables (if defined)

 |

 |  __weakref__

 |      list of weak references to the object (if defined)

 |

 |  ----------------------------------------------------------------------

 |  Data and other attributes inherited from _NDFrameIndexer:

 |

 |  axis = None

 |

 |  ----------------------------------------------------------------------

 |  Methods inherited from pandas._libs.indexing._NDFrameIndexerBase:

 |

 |  __init__(self, /, *args, **kwargs)

 |      Initialize self.  See help(type(self)) for accurate signature.

 |

 |  __reduce__ = __reduce_cython__(...)

 |

 |  __setstate__ = __setstate_cython__(...)

 |

 |  ----------------------------------------------------------------------

 |  Static methods inherited from pandas._libs.indexing._NDFrameIndexerBase:

 |

 |  __new__(*args, **kwargs) from builtins.type

 |      Create and return a new object.  See help(type) for accurate signature.

 |

 |  ----------------------------------------------------------------------

 |  Data descriptors inherited from pandas._libs.indexing._NDFrameIndexerBase:

 |

 |  name

 |

 |  ndim

 |

 |  obj

pd.MultiIndex.from_arrays([['Ohio','Ohio','Colorado'],['Green','Red','Green']],names=['state','color'])

MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],

           labels=[[1, 1, 0], [0, 1, 0]],

           names=['state', 'color'])

Reodering and Sorting levels

help(frame.swaplevel)

Help on method swaplevel in module pandas.core.frame:

swaplevel(i=-2, j=-1, axis=0) method of pandas.core.frame.DataFrame instance

    Swap levels i and j in a MultiIndex on a particular axis

    Parameters

    ----------

    i, j : int, string (can be mixed)

        Level of index to be swapped. Can pass level name as string.

    Returns

    -------

    swapped : type of caller (new object)

    .. versionchanged:: 0.18.1

       The indexes ``i`` and ``j`` are now optional, and default to

       the two innermost levels of the index.

frame.swaplevel('Key1','Key2')

	state	Ohio		Colorado
	color	Green	Red	Green
Key2	Key1
1	a	0	1	2
2	a	3	4	5
1	b	6	7	8
2	b	9	10	11

help(frame.sort_index)

Help on method sort_index in module pandas.core.frame:

sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, by=None) method of pandas.core.frame.DataFrame instance

    Sort object by labels (along an axis)

    Parameters

    ----------

    axis : index, columns to direct sorting

    level : int or level name or list of ints or list of level names

        if not None, sort on values in specified index level(s)

    ascending : boolean, default True

        Sort ascending vs. descending

    inplace : bool, default False

        if True, perform operation in-place

    kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'

         Choice of sorting algorithm. See also ndarray.np.sort for more

         information.  `mergesort` is the only stable algorithm. For

         DataFrames, this option is only applied when sorting on a single

         column or label.

    na_position : {'first', 'last'}, default 'last'

         `first` puts NaNs at the beginning, `last` puts NaNs at the end.

         Not implemented for MultiIndex.

    sort_remaining : bool, default True

        if true and sorting by level and index is multilevel, sort by other

        levels too (in order) after sorting by specified level

    Returns

    -------

    sorted_obj : DataFrame

frame.sort_index(level=0)

	state	Ohio		Colorado
	color	Green	Red	Green
Key1	Key2
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

frame.sort_index(level=1)

	state	Ohio		Colorado
	color	Green	Red	Green
Key1	Key2
a	1	0	1	2
b	1	6	7	8
a	2	3	4	5
b	2	9	10	11

frame.sort_index(level='Key2')

	state	Ohio		Colorado
	color	Green	Red	Green
Key1	Key2
a	1	0	1	2
b	1	6	7	8
a	2	3	4	5
b	2	9	10	11

frame.swaplevel(0,1).sort_index(level=0 )

	state	Ohio		Colorado
	color	Green	Red	Green
Key2	Key1
1	a	0	1	2
1	b	6	7	8
2	a	3	4	5
2	b	9	10	11

Summary statistics by level

frame

	state	Ohio		Colorado
	color	Green	Red	Green
Key1	Key2
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

frame.sum(level='Key2')

state	Ohio		Colorado
color	Green	Red	Green
Key2
1	6	8	10
2	12	14	16

frame.sum(level='Key1')

state	Ohio		Colorado
color	Green	Red	Green
Key1
a	3	5	7
b	15	17	19

frame.sum(level='color',axis=1)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	color	Green	Red
Key1	Key2
a	1	2	1
a	2	8	4
b	1	14	7
b	2	20	10

Indexing with a DataFrame's columns

frame=pd.DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':[0,1,2,0,1,2,3]})

frame

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	a	b	c	d
0	0	7	one	0
1	1	6	one	1
2	2	5	one	2
3	3	4	two	0
4	4	3	two	1
5	5	2	two	2
6	6	1	two	3

frame2=frame.set_index(['c','d']);frame2

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

		a	b
c	d
one	0	0	7
	1	1	6
	2	2	5
two	0	3	4
	1	4	3
	2	5	2
	3	6	1

By default,the columns are removed from the DataFrame,though you can leave them in:

frame.set_index(['c','d'],drop=False)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

		a	b	c	d
c	d
one	0	0	7	one	0
	1	1	6	one	1
	2	2	5	one	2
two	0	3	4	two	0
	1	4	3	two	1
	2	5	2	two	2
	3	6	1	two	3

reset_index does the opposite of set_index.

frame2.reset_index()

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	c	d	a	b
0	one	0	0	7
1	one	1	1	6
2	one	2	2	5
3	two	0	3	4
4	two	1	4	3
5	two	2	5	2
6	two	3	6	1

Combining and merging datasets

Database-style DataFrame joins

help(pd.merge)

Help on function merge in module pandas.core.reshape.merge:

merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)

    Merge DataFrame objects by performing a database-style join operation by

    columns or indexes.

    If joining columns on columns, the DataFrame indexes *will be

    ignored*. Otherwise if joining indexes on indexes or indexes on a column or

    columns, the index will be passed on.

    Parameters

    ----------

    left : DataFrame

    right : DataFrame

    how : {'left', 'right', 'outer', 'inner'}, default 'inner'

        * left: use only keys from left frame, similar to a SQL left outer join;

          preserve key order

        * right: use only keys from right frame, similar to a SQL right outer join;

          preserve key order

        * outer: use union of keys from both frames, similar to a SQL full outer

          join; sort keys lexicographically

        * inner: use intersection of keys from both frames, similar to a SQL inner

          join; preserve the order of the left keys

    on : label or list

        Column or index level names to join on. These must be found in both

        DataFrames. If `on` is None and not merging on indexes then this defaults

        to the intersection of the columns in both DataFrames.

    left_on : label or list, or array-like

        Column or index level names to join on in the left DataFrame. Can also

        be an array or list of arrays of the length of the left DataFrame.

        These arrays are treated as if they are columns.

    right_on : label or list, or array-like

        Column or index level names to join on in the right DataFrame. Can also

        be an array or list of arrays of the length of the right DataFrame.

        These arrays are treated as if they are columns.

    left_index : boolean, default False

        Use the index from the left DataFrame as the join key(s). If it is a

        MultiIndex, the number of keys in the other DataFrame (either the index

        or a number of columns) must match the number of levels

    right_index : boolean, default False

        Use the index from the right DataFrame as the join key. Same caveats as

        left_index

    sort : boolean, default False

        Sort the join keys lexicographically in the result DataFrame. If False,

        the order of the join keys depends on the join type (how keyword)

    suffixes : 2-length sequence (tuple, list, ...)

        Suffix to apply to overlapping column names in the left and right

        side, respectively

    copy : boolean, default True

        If False, do not copy data unnecessarily

    indicator : boolean or string, default False

        If True, adds a column to output DataFrame called "_merge" with

        information on the source of each row.

        If string, column with information on source of each row will be added to

        output DataFrame, and column will be named value of string.

        Information column is Categorical-type and takes on a value of "left_only"

        for observations whose merge key only appears in 'left' DataFrame,

        "right_only" for observations whose merge key only appears in 'right'

        DataFrame, and "both" if the observation's merge key is found in both.

    validate : string, default None

        If specified, checks if merge is of specified type.

        * "one_to_one" or "1:1": check if merge keys are unique in both

          left and right datasets.

        * "one_to_many" or "1:m": check if merge keys are unique in left

          dataset.

        * "many_to_one" or "m:1": check if merge keys are unique in right

          dataset.

        * "many_to_many" or "m:m": allowed, but does not result in checks.

        .. versionadded:: 0.21.0

    Notes

    -----

    Support for specifying index levels as the `on`, `left_on`, and

    `right_on` parameters was added in version 0.23.0

    Examples

    --------

    >>> A              >>> B

        lkey value         rkey value

    0   foo  1         0   foo  5

    1   bar  2         1   bar  6

    2   baz  3         2   qux  7

    3   foo  4         3   bar  8

    >>> A.merge(B, left_on='lkey', right_on='rkey', how='outer')

       lkey  value_x  rkey  value_y

    0  foo   1        foo   5

    1  foo   4        foo   5

    2  bar   2        bar   6

    3  bar   2        bar   8

    4  baz   3        NaN   NaN

    5  NaN   NaN      qux   7

    Returns

    -------

    merged : DataFrame

        The output type will the be same as 'left', if it is a subclass

        of DataFrame.

    See also

    --------

    merge_ordered

    merge_asof

    DataFrame.join

df1=pd.DataFrame({'key':['b','b','a','c','a','a','b'],'data1':range(7)});df1

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	data1
0	b	0
1	b	1
2	a	2
3	c	3
4	a	4
5	a	5
6	b	6

df2=pd.DataFrame({'key':['a','b','d'],'data2':range(3)});df2

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	data2
0	a	0
1	b	1
2	d	2

pd.merge(df1,df2)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	data1	data2
0	b	0	1
1	b	1	1
2	b	6	1
3	a	2	0
4	a	4	0
5	a	5	0

By default,how is 'inner',meaning result is intersection. It's a good pratice to specify explicitly which column to join on using on.

pd.merge(df1,df2,on='key')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	data1	data2
0	b	0	1
1	b	1	1
2	b	6	1
3	a	2	0
4	a	4	0
5	a	5	0

If the column names are different in each object,you can specify them separately.

df3=pd.DataFrame({'lkey':['b','b','a','c','a','a','b'],'data1':range(7)});df3

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	lkey	data1
0	b	0
1	b	1
2	a	2
3	c	3
4	a	4
5	a	5
6	b	6

df4=pd.DataFrame({'rkey':['a','b','d'],'data2':range(3)});df4

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	rkey	data2
0	a	0
1	b	1
2	d	2

pd.merge(df3,df4,left_on='lkey',right_on='rkey')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	lkey	data1	rkey	data2
0	b	0	b	1
1	b	1	b	1
2	b	6	b	1
3	a	2	a	0
4	a	4	a	0
5	a	5	a	0

pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='outer')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	lkey	data1	rkey	data2
0	b	0.0	b	1.0
1	b	1.0	b	1.0
2	b	6.0	b	1.0
3	a	2.0	a	0.0
4	a	4.0	a	0.0
5	a	5.0	a	0.0
6	c	3.0	NaN	NaN
7	NaN	NaN	d	2.0

pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='left')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	lkey	data1	rkey	data2
0	b	0	b	1.0
1	b	1	b	1.0
2	a	2	a	0.0
3	c	3	NaN	NaN
4	a	4	a	0.0
5	a	5	a	0.0
6	b	6	b	1.0

pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='right')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	lkey	data1	rkey	data2
0	b	0.0	b	1
1	b	1.0	b	1
2	b	6.0	b	1
3	a	2.0	a	0
4	a	4.0	a	0
5	a	5.0	a	0
6	NaN	NaN	d	2

Above, the many-to-one case has been demonstrated,and that means in pd.merge(df3,df4),values in column 'key' of df4 are all unique.Now,in terms of many-to-many,which means values in column 'key' of df4 are not unique,it forms the Cartesian product of rows.

df1=pd.DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)});df1

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	data1
0	b	0
1	b	1
2	a	2
3	c	3
4	a	4
5	b	5

df2=pd.DataFrame({'key':['a','b','a','b','d'],'data2':range(5)});df2

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	data2
0	a	0
1	b	1
2	a	2
3	b	3
4	d	4

pd.merge(df1,df2,on='key',how='left')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	data1	data2
0	b	0	1.0
1	b	0	3.0
2	b	1	1.0
3	b	1	3.0
4	a	2	0.0
5	a	2	2.0
6	c	3	NaN
7	a	4	0.0
8	a	4	2.0
9	b	5	1.0
10	b	5	3.0

pd.merge(df1,df2,how='inner')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	data1	data2
0	b	0	1
1	b	0	3
2	b	1	1
3	b	1	3
4	b	5	1
5	b	5	3
6	a	2	0
7	a	2	2
8	a	4	0
9	a	4	2

To merge with multiple keys,pass a list of column names.

left=pd.DataFrame({'key1':['foo','foo','bar'],

                  'key2':['one','two','one'],

                  'lval':[1,2,3]});left

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key1	key2	lval
0	foo	one	1
1	foo	two	2
2	bar	one	3

right=pd.DataFrame({'key1':['foo','foo','bar','bar'],

                   'key2':['one','one','one','one'],

                   'rval':[4,5,6,7]});right

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key1	key2	rval
0	foo	one	4
1	foo	one	5
2	bar	one	6
3	bar	one	7

pd.merge(left,right,on=['key1','key2'],how='outer')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key1	key2	lval	rval
0	foo	one	1	4.0
1	foo	one	1	5.0
2	foo	two	2	NaN
3	bar	one	3	6.0
4	bar	one	3	7.0

A last issue to consider in merge operations is the treatment of overlapping column names;merge has a suffixes option for specifying strings to append to overlapping names in the left and right DataFrame objects.

pd.merge(left,right,on='key1')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key1	key2_x	lval	key2_y	rval
0	foo	one	1	one	4
1	foo	one	1	one	5
2	foo	two	2	one	4
3	foo	two	2	one	5
4	bar	one	3	one	6
5	bar	one	3	one	7

pd.merge(left,right,on='key1',suffixes=('_left','_right'))

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key1	key2_left	lval	key2_right	rval
0	foo	one	1	one	4
1	foo	one	1	one	5
2	foo	two	2	one	4
3	foo	two	2	one	5
4	bar	one	3	one	6
5	bar	one	3	one	7

Merging on Index

In some cases, the merge key(s) in a DataFrame will be found in its index. In this case,you can pass left_index=True or right_index=True(or both) to indicate that the index should be used as the merge key.

left1=pd.DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)});left1

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	value
0	a	0
1	b	1
2	a	2
3	a	3
4	b	4
5	c	5

right1=pd.DataFrame({'group_val':[3.5,7]},index=['a','b']);right1

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	group_val
a	3.5
b	7.0

pd.merge(left1,right1,left_on='key',right_index=True)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	value	group_val
0	a	0	3.5
2	a	2	3.5
3	a	3	3.5
1	b	1	7.0
4	b	4	7.0

pd.merge(left1,right1,left_on='key',right_index=True,how='outer')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	value	group_val
0	a	0	3.5
2	a	2	3.5
3	a	3	3.5
1	b	1	7.0
4	b	4	7.0
5	c	5	NaN

With hierachically indexed data,things are more complicated,as joining on index is implicitly a multiple-key merge:

lefth=pd.DataFrame({'key1':['Ohio','Ohio','Ohio','Nevada','Devada'],

                   'key2':[2000,2001,2002,2001,2002],

                   'data':np.arange(5)});lefth

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key1	key2	data
0	Ohio	2000	0
1	Ohio	2001	1
2	Ohio	2002	2
3	Nevada	2001	3
4	Devada	2002	4

righth=pd.DataFrame(np.arange(12).reshape((6,2)),index=[['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],

                                                       [2001,2000,2000,2000,2001,2002]],

                   columns=['event1','event2']);righth

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

		event1	event2
Nevada	2001	0	1
Nevada	2000	2	3
Ohio	2000	4	5
	2000	6	7
	2001	8	9
	2002	10	11

pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key1	key2	data	event1	event2
0	Ohio	2000	0	4	5
0	Ohio	2000	0	6	7
1	Ohio	2001	1	8	9
2	Ohio	2002	2	10	11
3	Nevada	2001	3	0	1

pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True,how='outer')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key1	key2	data	event1	event2
0	Ohio	2000	0.0	4.0	5.0
0	Ohio	2000	0.0	6.0	7.0
1	Ohio	2001	1.0	8.0	9.0
2	Ohio	2002	2.0	10.0	11.0
3	Nevada	2001	3.0	0.0	1.0
4	Devada	2002	4.0	NaN	NaN
4	Nevada	2000	NaN	2.0	3.0

Using the indexes of both sides of the merge is also possible.

left2=pd.DataFrame([[1,2],[3,4],[5,6]],

                  index=['a','b','c'],

                  columns=['Ohio','Nevada']);left2

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	Ohio	Nevada
a	1	2
b	3	4
c	5	6

right2=pd.DataFrame([[7,8],[9,10],[11,12],[13,14]],

                   index=['b','c','d','e'],

                   columns=['Missori','Alabama']);right2

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	Missori	Alabama
b	7	8
c	9	10
d	11	12
e	13	14

pd.merge(left2,right2,how='outer',left_index=True,right_index=True)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	Ohio	Nevada	Missori	Alabama
a	1.0	2.0	NaN	NaN
b	3.0	4.0	7.0	8.0
c	5.0	6.0	9.0	10.0
d	NaN	NaN	11.0	12.0
e	NaN	NaN	13.0	14.0

DataFrame has a convenient join instance for merging by index.It can also be used to combine together many DataFrame objects having the same or similiar indexes but non-overlapping columns.

left2.join(right2,how='outer')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	Ohio	Nevada	Missori	Alabama
a	1.0	2.0	NaN	NaN
b	3.0	4.0	7.0	8.0
c	5.0	6.0	9.0	10.0
d	NaN	NaN	11.0	12.0
e	NaN	NaN	13.0	14.0

left1

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	value
0	a	0
1	b	1
2	a	2
3	a	3
4	b	4
5	c	5

right1

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	group_val
a	3.5
b	7.0

left1.join(right1,on='key')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	value	group_val
0	a	0	3.5
1	b	1	7.0
2	a	2	3.5
3	a	3	3.5
4	b	4	7.0
5	c	5	NaN

another=pd.DataFrame([[7,8],[9,10],[11,12],[16,17]],index=['a','c','e','f'],columns=['New York','Oregon']);another

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	New York	Oregon
a	7	8
c	9	10
e	11	12
f	16	17

left2

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	Ohio	Nevada
a	1	2
b	3	4
c	5	6

right2

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	Missori	Alabama
b	7	8
c	9	10
d	11	12
e	13	14

left2.join([right2,another])

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	Ohio	Nevada	Missori	Alabama	New York	Oregon
a	1	2	NaN	NaN	7.0	8.0
b	3	4	7.0	8.0	NaN	NaN
c	5	6	9.0	10.0	9.0	10.0

left2.join([right2,another],how='outer')

D:\Anaconda\lib\site-packages\pandas\core\frame.py:6369: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  verify_integrity=True)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	Ohio	Nevada	Missori	Alabama	New York	Oregon
a	1.0	2.0	NaN	NaN	7.0	8.0
b	3.0	4.0	7.0	8.0	NaN	NaN
c	5.0	6.0	9.0	10.0	9.0	10.0
d	NaN	NaN	11.0	12.0	NaN	NaN
e	NaN	NaN	13.0	14.0	11.0	12.0
f	NaN	NaN	NaN	NaN	16.0	17.0

Concatenating along an axis

Another kind of data combination operation is referred to interchangebly as concatenation,binding or stacking.Numpy's concatenate can do this with Numpy arrays.

arr=np.arange(12).reshape((3,4));arr

array([[ 0,  1,  2,  3],

       [ 4,  5,  6,  7],

       [ 8,  9, 10, 11]])

np.concatenate([arr,arr],axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],

       [ 4,  5,  6,  7,  4,  5,  6,  7],

       [ 8,  9, 10, 11,  8,  9, 10, 11]])

np.concatenate([arr,arr],axis=0)

array([[ 0,  1,  2,  3],

       [ 4,  5,  6,  7],

       [ 8,  9, 10, 11],

       [ 0,  1,  2,  3],

       [ 4,  5,  6,  7],

       [ 8,  9, 10, 11]])

s1=pd.Series([0,1],index=['a','b']);s1

a    0

b    1

dtype: int64

s2=pd.Series([2,3,4],index=['c','d','e']);s2

c    2

d    3

e    4

dtype: int64

s3=pd.Series([5,6],index=['f','g'])

pd.concat([s1,s2,s3])

a    0

b    1

c    2

d    3

e    4

f    5

g    6

dtype: int64

By default,concat works along axis=0,producing another Series.If you pass axis=1,the result will instead be a DataFrame(axis=1 is the column)

pd.concat([s1,s2,s3],axis=1)

D:\Anaconda\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  """Entry point for launching an IPython kernel.

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	0	1	2
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

s4=pd.concat([s1,s2]);s4

a    0

b    1

c    2

d    3

e    4

dtype: int64

s1

a    0

b    1

dtype: int64

pd.concat([s1,s4])

a    0

b    1

a    0

b    1

c    2

d    3

e    4

dtype: int64

pd.concat([s1,s4],axis=1)

D:\Anaconda\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  """Entry point for launching an IPython kernel.

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	0	1
a	0.0	0
b	1.0	1
c	NaN	2
d	NaN	3
e	NaN	4

join:either 'inner' or 'outer'(default);whether to intersection(inner) or union(outer) together indexes along the other axis.

pd.concat([s1,s4],axis=1,join='inner')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	0	1
a	0	0
b	1	1

join_axes: Specific indexes to use for the other n-1 axes insteda of performing union/intersection loggic

pd.concat([s1,s2],axis=1,join_axes=[['a','c','b','e']])

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	0	1
a	0.0	NaN
c	NaN	2.0
b	1.0	NaN
e	NaN	4.0

keys:values to associate with objects being concatenated,forming a hierarchical index along the concatenation axis;can either be a list or array of arbitrary values,an array of tuples,or a list of arrays(if multiple-level arrays passed in levels)

result=pd.concat([s1,s1,s3],keys=['one','two','three']);result

one    a    0

       b    1

two    a    0

       b    1

three  f    5

       g    6

dtype: int64

result.unstack()

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	a	b	f	g
one	0.0	1.0	NaN	NaN
two	0.0	1.0	NaN	NaN
three	NaN	NaN	5.0	6.0

In the case of combining Series along axis=1,the keys become the DataFrame columns headers.

pd.concat([s1,s2,s3],axis=1,keys=['one','two','three'])

D:\Anaconda\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  """Entry point for launching an IPython kernel.

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	one	two	three
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

The same logix extends to DataFrame objects:

df1=pd.DataFrame(np.arange(6).reshape((3,2)),index=['a','b','c'],columns=['one','two']);df1

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	one	two
a	0	1
b	2	3
c	4	5

df2=pd.DataFrame(5+np.arange(4).reshape((2,2)),index=['a','c'],columns=['three','four']);df2

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	three	four
a	5	6
c	7	8

pd.concat([df1,df2],axis=1,keys=['level1','level2'],sort=True)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead tr th { text-align: left }

	level1		level2
	one	two	three	four
a	0	1	5.0	6.0
b	2	3	NaN	NaN
c	4	5	7.0	8.0

pd.concat([df1,df2],sort=False)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	one	two	three	four
a	0.0	1.0	NaN	NaN
b	2.0	3.0	NaN	NaN
c	4.0	5.0	NaN	NaN
a	NaN	NaN	5.0	6.0
c	NaN	NaN	7.0	8.0

pd.concat([df1,df2],sort=True)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	four	one	three	two
a	NaN	0.0	NaN	1.0
b	NaN	2.0	NaN	3.0
c	NaN	4.0	NaN	5.0
a	6.0	NaN	5.0	NaN
c	8.0	NaN	7.0	NaN

If you pass a dict of objects instead of a list,the dict's keys will be used for the keys option.

pd.concat({'level1':df1,'level2':df2},axis=1,sort=False)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead tr th { text-align: left }

	level1		level2
	one	two	three	four
a	0	1	5.0	6.0
b	2	3	NaN	NaN
c	4	5	7.0	8.0

We can name the created axis levels with the names argument.names: Names for created hierarchical levels if keys and//or levels passed.

pd.concat([df1,df2],keys=['level1','level2'],names=['upper','lower'],sort=False)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

		one	two	three	four
upper	lower
level1	a	0.0	1.0	NaN	NaN
	b	2.0	3.0	NaN	NaN
	c	4.0	5.0	NaN	NaN
level2	a	NaN	NaN	5.0	6.0
level2	c	NaN	NaN	7.0	8.0

pd.concat([df1,df2],keys=['level1','level2'],names=['upper','lower'],sort=False,axis=1)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead tr th { text-align: left }

upper	level1		level2
lower	one	two	three	four
a	0	1	5.0	6.0
b	2	3	NaN	NaN
c	4	5	7.0	8.0

df1=pd.DataFrame(np.random.randn(3,4),columns=['a','b','c','d']);df1

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	a	b	c	d
0	-0.285393	-0.625140	-0.244858	1.870425
1	-1.651745	-2.094833	0.233144	0.083170
2	2.497868	0.004263	1.376631	-0.497225

df2=pd.DataFrame(np.random.randn(2,3),columns=['b','d','a']);df2

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	b	d	a
0	0.330073	-0.546400	-1.291143
1	-0.541348	-1.003454	1.578515

A last consideration concerns DataFrames in which the row index does not contain any relevent data.

pd.concat([df1,df2],sort=False)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	a	b	c	d
0	-0.285393	-0.625140	-0.244858	1.870425
1	-1.651745	-2.094833	0.233144	0.083170
2	2.497868	0.004263	1.376631	-0.497225
0	-1.291143	0.330073	NaN	-0.546400
1	1.578515	-0.541348	NaN	-1.003454

ignore_index:Do not preserve indexes along concatenation axis,instead producing a new range(total_length) index.

pd.concat([df1,df2],sort=False,ignore_index=True)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	a	b	c	d
0	-0.285393	-0.625140	-0.244858	1.870425
1	-1.651745	-2.094833	0.233144	0.083170
2	2.497868	0.004263	1.376631	-0.497225
3	-1.291143	0.330073	NaN	-0.546400
4	1.578515	-0.541348	NaN	-1.003454

pd.concat([df1,df2],axis=1)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	a	b	c	d	b	d	a
0	-0.285393	-0.625140	-0.244858	1.870425	0.330073	-0.546400	-1.291143
1	-1.651745	-2.094833	0.233144	0.083170	-0.541348	-1.003454	1.578515
2	2.497868	0.004263	1.376631	-0.497225	NaN	NaN	NaN

pd.concat([df1,df2],axis=1,keys=['one','two'])

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead tr th { text-align: left }

	one				two
	a	b	c	d	b	d	a
0	-0.285393	-0.625140	-0.244858	1.870425	0.330073	-0.546400	-1.291143
1	-1.651745	-2.094833	0.233144	0.083170	-0.541348	-1.003454	1.578515
2	2.497868	0.004263	1.376631	-0.497225	NaN	NaN	NaN

pd.concat([df1,df2],axis=1,keys=['one','two'],names=['name1','name2'])

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead tr th { text-align: left }

name1	one				two
name2	a	b	c	d	b	d	a
0	-0.285393	-0.625140	-0.244858	1.870425	0.330073	-0.546400	-1.291143
1	-1.651745	-2.094833	0.233144	0.083170	-0.541348	-1.003454	1.578515
2	2.497868	0.004263	1.376631	-0.497225	NaN	NaN	NaN

Combining data with overlap

help(np.where)

Help on built-in function where in module numpy.core.multiarray:

where(...)

    where(condition, [x, y])

    Return elements, either from `x` or `y`, depending on `condition`.

    If only `condition` is given, return ``condition.nonzero()``.

    Parameters

    ----------

    condition : array_like, bool

        When True, yield `x`, otherwise yield `y`.

    x, y : array_like, optional

        Values from which to choose. `x`, `y` and `condition` need to be

        broadcastable to some shape.

    Returns

    -------

    out : ndarray or tuple of ndarrays

        If both `x` and `y` are specified, the output array contains

        elements of `x` where `condition` is True, and elements from

        `y` elsewhere.

        If only `condition` is given, return the tuple

        ``condition.nonzero()``, the indices where `condition` is True.

    See Also

    --------

    nonzero, choose

    Notes

    -----

    If `x` and `y` are given and input arrays are 1-D, `where` is

    equivalent to::

        [xv if c else yv for (c,xv,yv) in zip(condition,x,y)]

    Examples

    --------

    >>> np.where([[True, False], [True, True]],

    ...          [[1, 2], [3, 4]],

    ...          [[9, 8], [7, 6]])

    array([[1, 8],

           [3, 4]])

    >>> np.where([[0, 1], [1, 0]])

    (array([0, 1]), array([1, 0]))

    >>> x = np.arange(9.).reshape(3, 3)

    >>> np.where( x > 5 )

    (array([2, 2, 2]), array([0, 1, 2]))

    >>> x[np.where( x > 3.0 )]               # Note: result is 1D.

    array([ 4.,  5.,  6.,  7.,  8.])

    >>> np.where(x < 5, x, -1)               # Note: broadcasting.

    array([[ 0.,  1.,  2.],

           [ 3.,  4., -1.],

           [-1., -1., -1.]])

    Find the indices of elements of `x` that are in `goodvalues`.

    >>> goodvalues = [3, 4, 7]

    >>> ix = np.isin(x, goodvalues)

    >>> ix

    array([[False, False, False],

           [ True,  True, False],

           [False,  True, False]])

    >>> np.where(ix)

    (array([1, 1, 2]), array([0, 1, 1]))

x=np.arange(9).reshape(3,3);x

array([[0, 1, 2],

       [3, 4, 5],

       [6, 7, 8]])

np.where(x>5)

(array([2, 2, 2], dtype=int64), array([0, 1, 2], dtype=int64))

x[np.where(x>5)]

array([6, 7, 8])

x[(np.array([2,2,2]),np.array([0,1,2]))]

array([6, 7, 8])

x[2]

array([6, 7, 8])

There is another data combination situation that cannot be expressed as either a merge or concatenation operation.You may have two datasets whose indexes overlap in full or part.As a motivating example,consider Numpy's where function,which performs the array-oriented equivalent of an if-else expression.

a=pd.Series([np.nan,2.5,np.nan,3.5,4.5,np.nan],index=['f','e','d','c','b','a']);a

f    NaN

e    2.5

d    NaN

c    3.5

b    4.5

a    NaN

dtype: float64

b=pd.Series(np.arange(len(a),dtype=np.float64),index=['f','e','d','c','b','a']);b

f    0.0

e    1.0

d    2.0

c    3.0

b    4.0

a    5.0

dtype: float64

b[-1]=np.nan;b

f    0.0

e    1.0

d    2.0

c    3.0

b    4.0

a    NaN

dtype: float64

pd.isnull(a)

f     True

e    False

d     True

c    False

b    False

a     True

dtype: bool

np.where(pd.isnull(a),b,a)

array([0. , 2.5, 2. , 3.5, 4.5, nan])

Series has a combine_first method,which performs the equivalent of this operation along with pandas's usual data alignment logic:

help(pd.Series.combine_first)

Help on function combine_first in module pandas.core.series:

combine_first(self, other)

    Combine Series values, choosing the calling Series's values

    first. Result index will be the union of the two indexes

    Parameters

    ----------

    other : Series

    Returns

    -------

    combined : Series

    Examples

    --------

    >>> s1 = pd.Series([1, np.nan])

    >>> s2 = pd.Series([3, 4])

    >>> s1.combine_first(s2)

    0    1.0

    1    4.0

    dtype: float64

    See Also

    --------

    Series.combine : Perform elementwise operation on two Series

        using a given function

f    0.0

e    1.0

d    2.0

c    3.0

b    4.0

a    NaN

dtype: float64

b[:-2]

f    0.0

e    1.0

d    2.0

c    3.0

dtype: float64

a[2:]

d    NaN

c    3.5

b    4.5

a    NaN

dtype: float64

f    NaN

e    2.5

d    NaN

c    3.5

b    4.5

a    NaN

dtype: float64

b[:-2].combine_first(a[2:]) # Combine Series values, choosing the calling Series's values first. Result index will be the union of the two indexes

a    NaN

b    4.5

c    3.0

d    2.0

e    1.0

f    0.0

dtype: float64

df1=pd.DataFrame({'a':[1,np.nan,5,np.nan],'b':[np.nan,2,np.nan,6],

                 'c':range(2,18,4)});df1

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	a	b	c
0	1.0	NaN	2
1	NaN	2.0	6
2	5.0	NaN	10
3	NaN	6.0	14

df2=pd.DataFrame({'a':[5,4,np.nan,3,7],'b':[np.nan,3,4,6,8]});df2

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	a	b
0	5.0	NaN
1	4.0	3.0
2	NaN	4.0
3	3.0	6.0
4	7.0	8.0

df1.combine_first(df2)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	a	b	c
0	1.0	NaN	2.0
1	4.0	2.0	6.0
2	5.0	4.0	10.0
3	3.0	6.0	14.0
4	7.0	8.0	NaN

With DattaFrames,combine_first does the same thing column by column,so you can think of it as 'patching' missing data in the calling object with data from the object you pass.

Reshaping and pivoting

Reshaping with Hierarchical indexing

stack

This 'rotates' or pivots from the columns in the data to the rows
unstack

This pivots from the rows into the columns

The word stack can be thought as the stack of index.

data=pd.DataFrame(np.arange(6).reshape((2,3)),index=pd.Index(['Ohio','Colorado'],name='state'),columns=pd.Index(['one','two','three'],name='number'));data

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

number	one	two	three
state
Ohio	0	1	2
Colorado	3	4	5

The reason why put index=pd.Index is to use name parameter in pd.Index,and that is not in pd.DataFrame.

result=data.stack()

result

state     number

Ohio      one       0

          two       1

          three     2

Colorado  one       3

          two       4

          three     5

dtype: int32

From a hierarchically indexed Series,you can rearrange the data back into a DataFrame with unstack.

result.unstack()

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

number	one	two	three
state
Ohio	0	1	2
Colorado	3	4	5

By default, the innermost level is unstacked(same with stack).You can unstack a different level by passing a level number of name.

result.unstack('state')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

state	Ohio	Colorado
number
one	0	3
two	1	4
three	2	5

result.unstack(0)

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

state	Ohio	Colorado
number
one	0	3
two	1	4
three	2	5

result

state     number

Ohio      one       0

          two       1

          three     2

Colorado  one       3

          two       4

          three     5

dtype: int32

result.unstack()#the default level is innermost level,'number'

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

number	one	two	three
state
Ohio	0	1	2
Colorado	3	4	5

result.unstack(1)  # The innermost level is 'number'

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

number	one	two	three
state
Ohio	0	1	2
Colorado	3	4	5

Unstacking might introduces missing data if all of the values in the level are not found in each of the subgroups.

s1=pd.Series([0,1,2,3],index=['a','b','c','d']);s1

a    0

b    1

c    2

d    3

dtype: int64

s2=pd.Series([4,5,6],index=['c','d','e']);s2

c    4

d    5

e    6

dtype: int64

data2=pd.concat([s1,s2],keys=['one','two']);data2

one  a    0

     b    1

     c    2

     d    3

two  c    4

     d    5

     e    6

dtype: int64

data2.unstack()

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	a	b	c	d	e
one	0.0	1.0	2.0	3.0	NaN
two	NaN	NaN	4.0	5.0	6.0

data2.unstack().stack()

one  a    0.0

     b    1.0

     c    2.0

     d    3.0

two  c    4.0

     d    5.0

     e    6.0

dtype: float64

data2.unstack().stack(dropna=False)

one  a    0.0

     b    1.0

     c    2.0

     d    3.0

     e    NaN

two  a    NaN

     b    NaN

     c    4.0

     d    5.0

     e    6.0

dtype: float64

When you unstack in a DataFrame,the level unstacked becomes the lowest level in the result:

result

state     number

Ohio      one       0

          two       1

          three     2

Colorado  one       3

          two       4

          three     5

dtype: int32

df=pd.DataFrame({'left':result,'right':result+5},columns=pd.Index(['left','right'],name='side'));df # parameter `name` rather than names!

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	side	left	right
state	number
Ohio	one	0	5
	two	1	6
	three	2	7
Colorado	one	3	8
	two	4	9
	three	5	10

df.unstack('state')

side	left		right
state	Ohio	Colorado	Ohio	Colorado
number
one	0	3	5	8
two	1	4	6	9
three	2	5	7	10

df.unstack('number')

side	left			right
number	one	two	three	one	two	three
state
Ohio	0	1	2	5	6	7
Colorado	3	4	5	8	9	10

When calling stack,we can indicate the name of the axis to stack.

df.unstack('state').stack('side')

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	state	Colorado	Ohio
number	side
one	left	3	0
one	right	8	5
two	left	4	1
two	right	9	6
three	left	5	2
three	right	10	7

Pivoting 'long' to 'wide' format

data=pd.read_csv(r'G:\PycharmProject\DataAnalysis\pydata-book-2nd-edition\examples\macrodata.csv')

data.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	year	quarter	realgdp	realcons	realinv	realgovt	realdpi	cpi	m1	tbilrate	unemp	pop	infl	realint
0	1959.0	1.0	2710.349	1707.4	286.898	470.045	1886.9	28.98	139.7	2.82	5.8	177.146	0.00	0.00
1	1959.0	2.0	2778.801	1733.7	310.859	481.301	1919.7	29.15	141.7	3.08	5.1	177.830	2.34	0.74
2	1959.0	3.0	2775.488	1751.8	289.226	491.260	1916.4	29.35	140.5	3.82	5.3	178.657	2.74	1.09
3	1959.0	4.0	2785.204	1753.7	299.356	484.052	1931.3	29.37	140.0	4.33	5.6	179.386	0.27	4.06
4	1960.0	1.0	2847.699	1770.5	331.722	462.199	1955.5	29.54	139.6	3.50	5.2	180.007	2.31	1.19

periods=pd.PeriodIndex(year=data.year,quarter=data.quarter,name='date')

columns=pd.Index(['realgdp','infl','unemp'],name='item');columns

Index(['realgdp', 'infl', 'unemp'], dtype='object', name='item')

data=data.reindex(columns=columns)

help(data.reindex)

Help on method reindex in module pandas.core.frame:

reindex(labels=None, index=None, columns=None, axis=None, method=None, copy=True, level=None, fill_value=nan, limit=None, tolerance=None) method of pandas.core.frame.DataFrame instance

    Conform DataFrame to new index with optional filling logic, placing

    NA/NaN in locations having no value in the previous index. A new object

    is produced unless the new index is equivalent to the current one and

    copy=False

    Parameters

    ----------

    labels : array-like, optional

        New labels / index to conform the axis specified by 'axis' to.

    index, columns : array-like, optional (should be specified using keywords)

        New labels / index to conform to. Preferably an Index object to

        avoid duplicating data

    axis : int or str, optional

        Axis to target. Can be either the axis name ('index', 'columns')

        or number (0, 1).

    method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}, optional

        method to use for filling holes in reindexed DataFrame.

        Please note: this is only applicable to DataFrames/Series with a

        monotonically increasing/decreasing index.

        * default: don't fill gaps

        * pad / ffill: propagate last valid observation forward to next

          valid

        * backfill / bfill: use next valid observation to fill gap

        * nearest: use nearest valid observations to fill gap

    copy : boolean, default True

        Return a new object, even if the passed indexes are the same

    level : int or name

        Broadcast across a level, matching Index values on the

        passed MultiIndex level

    fill_value : scalar, default np.NaN

        Value to use for missing values. Defaults to NaN, but can be any

        "compatible" value

    limit : int, default None

        Maximum number of consecutive elements to forward or backward fill

    tolerance : optional

        Maximum distance between original and new labels for inexact

        matches. The values of the index at the matching locations most

        satisfy the equation ``abs(index[indexer] - target) <= tolerance``.

        Tolerance may be a scalar value, which applies the same tolerance

        to all values, or list-like, which applies variable tolerance per

        element. List-like includes list, tuple, array, Series, and must be

        the same size as the index and its dtype must exactly match the

        index's type.

        .. versionadded:: 0.21.0 (list-like tolerance)

    Examples

    --------

    ``DataFrame.reindex`` supports two calling conventions

    * ``(index=index_labels, columns=column_labels, ...)``

    * ``(labels, axis={'index', 'columns'}, ...)``

    We *highly* recommend using keyword arguments to clarify your

    intent.

    Create a dataframe with some fictional data.

    >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']

    >>> df = pd.DataFrame({

    ...      'http_status': [200,200,404,404,301],

    ...      'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]},

    ...       index=index)

    >>> df

               http_status  response_time

    Firefox            200           0.04

    Chrome             200           0.02

    Safari             404           0.07

    IE10               404           0.08

    Konqueror          301           1.00

    Create a new index and reindex the dataframe. By default

    values in the new index that do not have corresponding

    records in the dataframe are assigned ``NaN``.

    >>> new_index= ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',

    ...             'Chrome']

    >>> df.reindex(new_index)

                   http_status  response_time

    Safari               404.0           0.07

    Iceweasel              NaN            NaN

    Comodo Dragon          NaN            NaN

    IE10                 404.0           0.08

    Chrome               200.0           0.02

    We can fill in the missing values by passing a value to

    the keyword ``fill_value``. Because the index is not monotonically

    increasing or decreasing, we cannot use arguments to the keyword

    ``method`` to fill the ``NaN`` values.

    >>> df.reindex(new_index, fill_value=0)

                   http_status  response_time

    Safari                 404           0.07

    Iceweasel                0           0.00

    Comodo Dragon            0           0.00

    IE10                   404           0.08

    Chrome                 200           0.02

    >>> df.reindex(new_index, fill_value='missing')

                  http_status response_time

    Safari                404          0.07

    Iceweasel         missing       missing

    Comodo Dragon     missing       missing

    IE10                  404          0.08

    Chrome                200          0.02

    We can also reindex the columns.

    >>> df.reindex(columns=['http_status', 'user_agent'])

               http_status  user_agent

    Firefox            200         NaN

    Chrome             200         NaN

    Safari             404         NaN

    IE10               404         NaN

    Konqueror          301         NaN

    Or we can use "axis-style" keyword arguments

    >>> df.reindex(['http_status', 'user_agent'], axis="columns")

               http_status  user_agent

    Firefox            200         NaN

    Chrome             200         NaN

    Safari             404         NaN

    IE10               404         NaN

    Konqueror          301         NaN

    To further illustrate the filling functionality in

    ``reindex``, we will create a dataframe with a

    monotonically increasing index (for example, a sequence

    of dates).

    >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')

    >>> df2 = pd.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]},

    ...                    index=date_index)

    >>> df2

                prices

    2010-01-01     100

    2010-01-02     101

    2010-01-03     NaN

    2010-01-04     100

    2010-01-05      89

    2010-01-06      88

    Suppose we decide to expand the dataframe to cover a wider

    date range.

    >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')

    >>> df2.reindex(date_index2)

                prices

    2009-12-29     NaN

    2009-12-30     NaN

    2009-12-31     NaN

    2010-01-01     100

    2010-01-02     101

    2010-01-03     NaN

    2010-01-04     100

    2010-01-05      89

    2010-01-06      88

    2010-01-07     NaN

    The index entries that did not have a value in the original data frame

    (for example, '2009-12-29') are by default filled with ``NaN``.

    If desired, we can fill in the missing values using one of several

    options.

    For example, to backpropagate the last valid value to fill the ``NaN``

    values, pass ``bfill`` as an argument to the ``method`` keyword.

    >>> df2.reindex(date_index2, method='bfill')

                prices

    2009-12-29     100

    2009-12-30     100

    2009-12-31     100

    2010-01-01     100

    2010-01-02     101

    2010-01-03     NaN

    2010-01-04     100

    2010-01-05      89

    2010-01-06      88

    2010-01-07     NaN

    Please note that the ``NaN`` value present in the original dataframe

    (at index value 2010-01-03) will not be filled by any of the

    value propagation schemes. This is because filling while reindexing

    does not look at dataframe values, but only compares the original and

    desired indexes. If you do want to fill in the ``NaN`` values present

    in the original dataframe, use the ``fillna()`` method.

    See the :ref:`user guide <basics.reindexing>` for more.

    Returns

    -------

    reindexed : DataFrame

ldata=data.stack().reset_index().rename(columns={'0':'value'})

ldata[:10]

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	level_0	item	0
0	0	realgdp	2710.349
1	0	infl	0.000
2	0	unemp	5.800
3	1	realgdp	2778.801
4	1	infl	2.340
5	1	unemp	5.100
6	2	realgdp	2775.488
7	2	infl	2.740
8	2	unemp	5.300
9	3	realgdp	2785.204

Pivoting 'wide' to 'long' format

An inverse operation to pivot for DataFrames is pands.melt.Rather than transforming one column into many in a new DataFrame, it mergers multiple columns into one,producing a DataFrame that is longer than the input.

help(pd.DataFrame.melt)

Help on function melt in module pandas.core.frame:

melt(self, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None)

    "Unpivots" a DataFrame from wide format to long format, optionally

    leaving identifier variables set.

    This function is useful to massage a DataFrame into a format where one

    or more columns are identifier variables (`id_vars`), while all other

    columns, considered measured variables (`value_vars`), are "unpivoted" to

    the row axis, leaving just two non-identifier columns, 'variable' and

    'value'.

    .. versionadded:: 0.20.0

    Parameters

    ----------

    frame : DataFrame

    id_vars : tuple, list, or ndarray, optional

        Column(s) to use as identifier variables.

    value_vars : tuple, list, or ndarray, optional

        Column(s) to unpivot. If not specified, uses all columns that

        are not set as `id_vars`.

    var_name : scalar

        Name to use for the 'variable' column. If None it uses

        ``frame.columns.name`` or 'variable'.

    value_name : scalar, default 'value'

        Name to use for the 'value' column.

    col_level : int or string, optional

        If columns are a MultiIndex then use this level to melt.

    See also

    --------

    melt

    pivot_table

    DataFrame.pivot

    Examples

    --------

    >>> import pandas as pd

    >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},

    ...                    'B': {0: 1, 1: 3, 2: 5},

    ...                    'C': {0: 2, 1: 4, 2: 6}})

    >>> df

       A  B  C

    0  a  1  2

    1  b  3  4

    2  c  5  6

    >>> df.melt(id_vars=['A'], value_vars=['B'])

       A variable  value

    0  a        B      1

    1  b        B      3

    2  c        B      5

    >>> df.melt(id_vars=['A'], value_vars=['B', 'C'])

       A variable  value

    0  a        B      1

    1  b        B      3

    2  c        B      5

    3  a        C      2

    4  b        C      4

    5  c        C      6

    The names of 'variable' and 'value' columns can be customized:

    >>> df.melt(id_vars=['A'], value_vars=['B'],

    ...         var_name='myVarname', value_name='myValname')

       A myVarname  myValname

    0  a         B          1

    1  b         B          3

    2  c         B          5

    If you have multi-index columns:

    >>> df.columns = [list('ABC'), list('DEF')]

    >>> df

       A  B  C

       D  E  F

    0  a  1  2

    1  b  3  4

    2  c  5  6

    >>> df.melt(col_level=0, id_vars=['A'], value_vars=['B'])

       A variable  value

    0  a        B      1

    1  b        B      3

    2  c        B      5

    >>> df.melt(id_vars=[('A', 'D')], value_vars=[('B', 'E')])

      (A, D) variable_0 variable_1  value

    0      a          B          E      1

    1      b          B          E      3

    2      c          B          E      5

df=pd.DataFrame({'key':['foo','bar','baz'],

                'A':[1,2,3],

                'B':[4,5,6],

                'C':[7,8,9]});df

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	A	B	C
0	foo	1	4	7
1	bar	2	5	8
2	baz	3	6	9

melted=pd.melt(df,['key']);melted

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	variable	value
0	foo	A	1
1	bar	A	2
2	baz	A	3
3	foo	B	4
4	bar	B	5
5	baz	B	6
6	foo	C	7
7	bar	C	8
8	baz	C	9

reshaped=melted.pivot('key','variable','value');reshaped

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

variable	A	B	C
key
bar	2	5	8
baz	3	6	9
foo	1	4	7

reshaped.reset_index()

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

variable	key	A	B	C
0	bar	2	5	8
1	baz	3	6	9
2	foo	1	4	7

pd.melt(df,id_vars=['key'],value_vars=['A','B'])

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	key	variable	value
0	foo	A	1
1	bar	A	2
2	baz	A	3
3	foo	B	4
4	bar	B	5
5	baz	B	6

pd.melt(df,value_vars=['A','B','C'])

.dataframe tbody tr th:only-of-type { vertical-align: middle }
\3c pre>\3c code>.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }

	variable	value
0	A	1
1	A	2
2	A	3
3	B	4
4	B	5
5	B	6
6	C	7
7	C	8
8	C	9