Pandas discrete note

Anh-Thi Dinh
In this note, a general dataframe is called df (type pandas.core.frame.DataFrame), a general series is call s (type pandas.core.series.Series).

Import library

1import pandas as pd
2import numpy as np # import numpy if necessary

Read/Write .csv file

1# READ
2df = pd.read_csv('filename.csv', sep=';') # default sep=','
3
4# if 1st col contains 0,1,2,...
5df = pd.read_csv('filename.csv', index_col=1)
6
7# with datetime info
8df = pd.read_csv(PATH_DATA_FOLDER+"raw_data.csv",
9                  parse_dates=['timestamp'],
10                  infer_datetime_format=True,
11                  cache_dates=True)
1# WRITE
2df.to_csv(path, index=False) # don't incldue index

Create a dataframe

1# FROM A LIST
2pd.DataFrame(a_list, colummns=['col_name'])
1# FROM A DICTIONARY
2names = ['John', 'Thi', 'Bi', 'Beo', 'Chang']
3ages =  [10, 20, 21, 18, 11]
4marks = [8, 9, 10, 6, 8]
5city = ['Ben Tre', 'Paris', 'Ho Chi Minh Ville', 'New York', 'DC']
6
7my_dict = {'Name':names, 'Ages':ages, 'Marks':marks, 'Place': city}
8students = pd.DataFrame(my_dict)
Name
Ages
Marks
Place
0
John
10
8
Ben Tre
1
Thi
20
9
Paris
2
Bi
21
10
Ho Chi Minh Ville
3
Beo
18
6
New York
4
Chang
11
8
DC

Adding

1# a column
2df['new_col] = [new_values]
1# a row
2df.loc['new_index'] = [new_value]
1# add a new col based on another's values
2df_im = df0.copy()[['col']]
3df_im['status'] = df0['col'].apply(lambda row: 1 if row>=80 else 0)

Shuffle rows

1# shuffle all rows and reset the index
2df_new = df.sample(frac=1).reset_index(drop=True)

Sorting

1df.sort_values(by='col1', ascending=False)

Select rows/columns/item(s)

Select Single value

Select a single value (with condition): Get the mark of Thi (9).
1# interchange `.values[0]` and `.iloc[0]`
2df[df.Name=='Thi'].Marks.values[0]
3df.loc[df.Name=='Thi', 'Marks'].values[0]
1# with indexes
2df.iloc[1,2] # row 2, column 3
1# column's name with row's index
2df[['Marks']].iloc[1].values[0] # column 'Marks', row 2
1# column's index with row's value
2df[df.Name=='Thi'].iloc[:,2].values[0] # column 3, row of 'Thi'

Select integer rows and named columns

1df.loc[1:5, 'col']

Select columns

Select a column (returns a Series)
1# with column's name
2df['Name']
3df.loc[:, 'Name']
1# with an index
2df.iloc[:,0]
Returns a pd.DataFrame
1df[['Name']]
2df.loc[:, ['Name']]
1# with an index
2df.iloc[:,[0]]
Select multi-columns (type DataFrame): Get columns Name & Place:
1# using columns's names
2df[['Name', 'Place']]
3df.loc[:, ['Name', 'Place']]
1# using indexes
2df.iloc[:, [0,-1]]

Select rows

Select a row (returns a Series)
1# with an index
2df.iloc[1]
1# with a condition
2df[df['Name']=='Thi'] # DataFrame
3df[df['Name']=='Thi'].iloc[0] # Series
1df[df.Name=='Thi'] # DataFrame
2df[df.Name=='Thi'].iloc[0] # Series
3df[df.Name=='Thi'].values[0] # ndarray
Select multi-rows (type DataFrame)
1# using indexes
2df.iloc[:3]
3df.loc[:2]
1# with conditions
2df[df['A'].isin([3, 6])]

MultiIndex

All multiindex

1arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo'], ['one', 'two', 'one', 'two', 'one', 'two']]
2index = pd.MultiIndex.from_arrays(arrays)
3df = pd.DataFrame(np.random.randn(3, 6), index=['A', 'B', 'C'], columns=index)
bar
baz
foo
one
two
one
two
one
two
A
-0.752333
0.490581
0.774629
0.487185
1.767773
0.028956
B
-0.057864
-0.221516
-0.568726
-0.563732
1.362453
-0.563213
C
-0.338319
-0.346590
0.012845
0.755455
1.260937
-0.038209
Selection,
1df.loc['A', ('baz', 'two')]
10.487185
1df.loc[:,('baz', 'two')]
1A    0.487185
2B   -0.563732
3C    0.755455
4Name: (baz, two), dtype: float64

With a single name column

If there are some column with single name,
1arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo'], [i for i in range(2)]*3]
2index = pd.MultiIndex.from_arrays(arrays)
3df1 = pd.DataFrame(np.random.randn(3, 6), index=['A', 'B', 'C'], columns=index)

Good practice

1# GOOD PRACTICE
2df1['time'] = [1,2,3]
3df_rs2 = df1
bar
baz
foo
time
0
1
0
1
0
1
A
-1.386119
-0.496755
1.482855
0.943795
-1.173290
-0.445365
1
B
-0.900710
-1.571009
1.086964
1.546927
-1.564426
0.622763
2
C
0.712231
0.235247
-0.807031
0.671802
0.597149
0.111332
3
Selection,
1# FOR GOOD PRACTICE
2df_rs2.loc['A', ('baz', 1)]
3df_rs2.loc['A', 'baz']
10.943795
20    1.482855
31    0.943795

Bad practice

1# BAD PRACTICE
2df2 = pd.DataFrame([1,2,3], index=['A', 'B', 'C'], columns=['time'])
3df_rs1 = pd.concat([df1, df2], axis=1)
(bar, 0)
(bar, 1)
(baz, 0)
(baz, 1)
(foo, 0)
(foo, 1)
time
A
-1.386119
-0.496755
1.482855
0.943795
-1.173290
-0.445365
1
B
-0.900710
-1.571009
1.086964
1.546927
-1.564426
0.622763
2
C
0.712231
0.235247
-0.807031
0.671802
0.597149
0.111332
3
Selection,
1# FOR BAD PRACTICE
2df.loc['A', [('baz', 0)]]
3df_rs1.loc['A', [('baz', i) for i in [0,1]]]
1(baz, 0)    0.729023
2(baz, 0)    1.482855
3(baz, 1)    0.943795

Rename multiindex

1# all columns' name at the level 1
2df.columns.set_levels(['b1','c1','f1'], level=1, inplace=True)

Drop multiindex

1df.columns = df.columns.droplevel()
1   a
2   b  c         b c
30  1  2   ->  0 1 2
41  3  4       1 3 4

Compare 2 dataframes

1df1.equals(df2)

True / False

1# Invert True/False value in Series
2s = pd.Series([True, True, False, True])
3~s
1# Convert True / False to 1 / 0
2df['col'] = df['col'].astype(int)
3# int or float