Basics
Basics pandas operations are: create DataFrame objects, add/drop rows, add/drop columns, rename columns, replace columns names, add/remove indexes.
Import the pandas library and aliasing as pd
import pandas as pd
Create an empty DataFrame
import pandas as pd
df = pd.DataFrame()
print(df)
Create a DataFrame from List of Lists. Specify values for each row
import pandas as pd
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
df = pd.DataFrame(data, columns=['Name', 'Age'])
print(df)
Create a DataFrame from a Dictionary of Lists. Specify values for each column
import pandas as pd
data = {'Name': ['Tom', 'Jack', 'Steve', 'Ricky'], 'Age': [28, 34, 29, 42]}
df = pd.DataFrame(data)
print(df)
Create a DataFrame with index
import pandas as pd
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
df = pd.DataFrame(data, columns=['Name', 'Age'], index=['a', 'b', 'c'])
print(df)
Generate a random DataFrame
import pandas as pd
import numpy as np
# create a DataFrame with 50 rows and 5 columns
df = pd.DataFrame(np.random.rand(50, 5))
Generate a random DataFrame with date index
import pandas as pd
import numpy as np
# create a DataFrame with 50 rows and 5 columns
df = pd.DataFrame(np.random.rand(50, 5))
# add row index as daily dates starting `2017-01-01`
df.index = pd.date_range('2017-01-01', periods=len(df), freq='D')
Set a column as the dataframe index
df = df.set_index('column_x')
Reset the dataframe index to the default integer index
df = df.reset_index()
Reset the dataframe index without inserting it into dataframe columns
df = df.reset_index(drop=True)
Rename several DataFrame columns
df = df.rename(columns={'col1_old': 'col1_new', 'col2_old': 'col2_new'})
# or
df.rename(columns={'col1_old': 'col1_new', 'col2_old': 'col2_new'},
inplace=True)
Replace all column names
new_cols_names = ['column_x', 'column_y', 'column_z']
df.columns = new_cols_names
Lowercase DataFrame columns names
df.columns = [x.lower() for x in df.columns]
# or
df.columns = map(str.lower, df.columns)
Remove/drop one column
df = df.drop('column_x', axis=1)
Remove/drop several columns
df = df.drop(['column_x', 'column_y'], axis=1)
# or
df = df.drop(columns=['column_x', 'column_y'])
Remove/drop rows by index
# drop rows 0 and 1
df = df.drop([0, 1])
Remove/drop first two rows
df = df.drop(df.index[:2])
Remove/drop last two rows
df = df.drop(df.index[-2:])
Remove/drop rows by names
import pandas as pd
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
df = pd.DataFrame(data, columns=['Name', 'Age'], index=['a', 'b', 'c'])
print(df)
>>> print(df)
Name Age
a Alex 10
b Bob 12
c Clarke 13
df = df.drop(['b', 'c'])
print(df)
>>> print(df)
Name Age
a Alex 10
Add column with all values a constant
df['column_new'] = 7
Add column on specific position (as second column)
df.insert(1, 'column_new', 7)
Add column based on values of other columns
df['column_new'] = df.column_x + df.column_y
Add column with values from list
data = [['Alex', 10], ['Bob', 12], ['Marie', 13]]
df = pd.DataFrame(data, columns=['Name', 'Age'])
df['column_new'] = ['boy', 'boy', 'girl']
print(df)
>>> print(df)
Name Age column_new
0 Alex 10 boy
1 Bob 12 boy
2 Marie 13 girl
Add new column based on values of other column and where condition
df['colummn_new'] = 'yes'
df['colummn_new'] = df['colummn_new'].where(df['column_x'] >= 50, 'no')
# equivalent with and more intuitive
import numpy as np
df['colummn_new'] = np.where(df['column_x'] >= 50, 'yes', 'no')
Add new column based on values of other columns and where condition
import pandas as pd
data = [['Alex', 10], ['Bob', 12], ['Marie', 21], ['Tina', 35]]
df = pd.DataFrame(data, columns=['Name', 'Age'])
df['Adult'] = 'yes'
df['Adult'] = df['Adult'].where(df['Age'] >= 18, 'no')
# or
import numpy as np
df['Adult'] = np.where(df['Age'] >= 18, 'yes', 'no')
# results
>>> df
Name Age Adult
0 Alex 10 no
1 Bob 12 no
2 Marie 21 yes
3 Tina 35 yes
Flatten/collapse the hierarchical index in columns
df.columns = ['_'.join(col).strip() for col in df.columns.values]
Iterate over DataFrame rows (slow operation)
# pandas vectorized functions should be used instead of rows iteration
for index, row in df.iterrows():
# row is a Series object
print(index, row['column_x'])
# or faster method
for row in df.itertuples():
# row is a named tuple with associated column names
print(row.Index, row.column_x, row.column_y)
print(row[0], row[1], row[2])