Basics

Basics pandas operations are: create DataFrame objects, add/drop rows, add/drop columns, rename columns, replace columns names, add/remove indexes.

Import the pandas library and aliasing as pd

import pandas as pd

Create an empty DataFrame

import pandas as pd
df = pd.DataFrame()
print(df)

Create a DataFrame from List of Lists. Specify values for each row

import pandas as pd
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
df = pd.DataFrame(data, columns=['Name', 'Age'])
print(df)

Create a DataFrame from a Dictionary of Lists. Specify values for each column

import pandas as pd
data = {'Name': ['Tom', 'Jack', 'Steve', 'Ricky'], 'Age': [28, 34, 29, 42]}
df = pd.DataFrame(data)
print(df)

Create a DataFrame with index

import pandas as pd
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
df = pd.DataFrame(data, columns=['Name', 'Age'], index=['a', 'b', 'c'])
print(df)

Generate a random DataFrame

import pandas as pd
import numpy as np
# create a DataFrame with 50 rows and 5 columns
df = pd.DataFrame(np.random.rand(50, 5))

Generate a random DataFrame with date index

import pandas as pd
import numpy as np
# create a DataFrame with 50 rows and 5 columns
df = pd.DataFrame(np.random.rand(50, 5))
# add row index as daily dates starting `2017-01-01`
df.index = pd.date_range('2017-01-01', periods=len(df), freq='D')

Set a column as the dataframe index

df = df.set_index('column_x')

Reset the dataframe index to the default integer index

df = df.reset_index()

Reset the dataframe index without inserting it into dataframe columns

df = df.reset_index(drop=True)

Rename several DataFrame columns

df = df.rename(columns={'col1_old': 'col1_new', 'col2_old': 'col2_new'})
# or
df.rename(columns={'col1_old': 'col1_new', 'col2_old': 'col2_new'},
          inplace=True)

Replace all column names

new_cols_names = ['column_x', 'column_y', 'column_z']
df.columns = new_cols_names

Lowercase DataFrame columns names

df.columns = [x.lower() for x in df.columns]
# or
df.columns = map(str.lower, df.columns)

Remove/drop one column

df = df.drop('column_x', axis=1)

Remove/drop several columns

df = df.drop(['column_x', 'column_y'], axis=1)
# or
df = df.drop(columns=['column_x', 'column_y'])

Remove/drop rows by index

# drop rows 0 and 1
df = df.drop([0, 1])

Remove/drop first two rows

df = df.drop(df.index[:2])

Remove/drop last two rows

df = df.drop(df.index[-2:])

Remove/drop rows by names

import pandas as pd
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
df = pd.DataFrame(data, columns=['Name', 'Age'], index=['a', 'b', 'c'])
print(df)

>>> print(df)
     Name  Age
a    Alex   10
b     Bob   12
c  Clarke   13

df = df.drop(['b', 'c'])
print(df)

>>> print(df)
   Name  Age
a  Alex   10

Add column with all values a constant

df['column_new'] = 7

Add column on specific position (as second column)

df.insert(1, 'column_new', 7)

Add column based on values of other columns

df['column_new'] = df.column_x + df.column_y

Add column with values from list

data = [['Alex', 10], ['Bob', 12], ['Marie', 13]]
df = pd.DataFrame(data, columns=['Name', 'Age'])
df['column_new'] = ['boy', 'boy', 'girl']
print(df)

>>> print(df)
    Name  Age column_new
0   Alex   10        boy
1    Bob   12        boy
2  Marie   13       girl

Add new column based on values of other column and where condition

df['colummn_new'] = 'yes'
df['colummn_new'] = df['colummn_new'].where(df['column_x'] >= 50, 'no')

# equivalent with and more intuitive
import numpy as np
df['colummn_new'] = np.where(df['column_x'] >= 50, 'yes', 'no')

Add new column based on values of other columns and where condition

import pandas as pd
data = [['Alex', 10], ['Bob', 12], ['Marie', 21], ['Tina', 35]]
df = pd.DataFrame(data, columns=['Name', 'Age'])
df['Adult'] = 'yes'
df['Adult'] = df['Adult'].where(df['Age'] >= 18, 'no')

# or
import numpy as np
df['Adult'] = np.where(df['Age'] >= 18, 'yes', 'no')

# results
>>> df
    Name  Age Adult
0   Alex   10    no
1    Bob   12    no
2  Marie   21   yes
3   Tina   35   yes

Flatten/collapse the hierarchical index in columns

df.columns = ['_'.join(col).strip() for col in df.columns.values]

Iterate over DataFrame rows (slow operation)

# pandas vectorized functions should be used instead of rows iteration
for index, row in df.iterrows():
    # row is a Series object
    print(index, row['column_x'])

# or faster method
for row in df.itertuples():
    # row is a named tuple with associated column names
    print(row.Index, row.column_x, row.column_y)
    print(row[0], row[1], row[2])