Time Series Data
pandas contains extensive capabilities and features for working with time series data for all domains. NumPy’s dtypes datetime64 and timedelta64 are used for manipulating time series data.
Create datetime object with python’s native method
from datetime import datetime
date = datetime(year=2010, month=7, day=10, hour=10, minute=50)
>>> date
datetime.datetime(2010, 7, 10, 10, 50)
Create datetime object with python’s native method
from dateutil import parser
date = parser.parse("10th of July, 2020 10:50")
>>> date
datetime.datetime(2020, 7, 10, 10, 50)
Extract date, time parts from datetime
from datetime import datetime
date = datetime(year=2010, month=7, day=10, hour=10, minute=50)
# format can be any code of strftime directive: http://strftime.org/
>>> date.strftime('%A')
'Saturday'
>>> date.strftime('%B')
'July'
>>> date.strftime('%H:%M')
'10:50'
Create datetime object with numpy (datetime64)
import numpy as np
date1 = np.array('2020-07-10', dtype=np.datetime64)
date2 = np.datetime64('2020-07-10 10:50')
>>> date1
array('2020-07-10', dtype='datetime64[D]')
>>> date2
numpy.datetime64('2020-07-10T10:50')
Create datetime object with pandas (Timestamp)
import pandas as pd
date = pd.to_datetime("10th of July, 2020 10:50")
# returns Timestamp object b/c we passed a single date to parse
>>> date
Timestamp('2020-07-10 10:50:00')
Create datetime object with pandas (datetime64)
import datetime as datetime
dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'), datetime.datetime(2018, 1, 1)])
# returns DatetimeIndex object b/c we passed multiple dates to parse
>>> dti
DatetimeIndex(['2018-01-01', '2018-01-01', '2018-01-01'], dtype='datetime64[ns]', freq=None)
Create DataFrame with DatetimeIndex as index
data = {'ticker': ['AAPL', 'AAPL', 'AAPL', 'AAPL'],
'open': [100, 110, 200, 205],
'close': [90, 120, 205, 210]
}
index = pd.DatetimeIndex(['2019-03-02', '2019-04-03', '2020-04-07', '2020-04-08'])
df = pd.DataFrame(data, index)
df.index.name = 'date'
>>> df
ticker open close
date
2019-03-02 AAPL 100 90
2019-04-03 AAPL 110 120
2020-04-07 AAPL 200 205
2020-04-08 AAPL 205 210
>>> df.index
DatetimeIndex(['2019-03-02', '2019-04-03', '2020-04-07', '2020-04-08'], dtype='datetime64[ns]', name='date', freq=None)
Slice DataFrame with DatetimeIndex on specific year/month/period
data = {'ticker': ['AAPL', 'AAPL', 'AAPL', 'AAPL'],
'open': [100, 110, 200, 205],
'close': [90, 120, 205, 210]
}
index = pd.DatetimeIndex(['2019-03-02', '2019-04-03', '2020-04-07', '2020-04-08'])
df = pd.DataFrame(data, index)
df.index.name = 'date'
>>> df
ticker open close
date
2019-03-02 AAPL 100 90
2019-04-03 AAPL 110 120
2020-04-07 AAPL 200 205
2020-04-08 AAPL 205 210
# 2019 rows only
>>> df['2019']
ticker open close
date
2019-03-02 AAPL 100 90
2019-04-03 AAPL 110 120
# 2019 April rows only
>>> df['2019-04']
ticker open close
date
2019-04-03 AAPL 110 120
# select rows for a specific time period
>>> df['2019-04-03':'2020-04-07']
ticker open close
date
2019-04-03 AAPL 110 120
2020-04-07 AAPL 200 205
Resample time series with aggregation
data = {'ticker': ['AAPL', 'AAPL', 'AAPL', 'AAPL'],
'open': [100, 110, 200, 205],
'close': [90, 120, 205, 210]
}
index = pd.DatetimeIndex(['2019-04-02', '2019-12-31', '2020-04-07', '2020-12-31'])
df = pd.DataFrame(data, index)
df.index.name = 'date'
# resample to annual frequency averaging values for each year
df_annual = df.resample('A').mean()
>>> df_annual
open close
date
2019-12-31 105.0 105.0
2020-12-31 202.5 207.5
Resample time series with selection
data = {'ticker': ['AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL'],
'open': [99, 100, 110, 200, 205],
'close': [99, 90, 120, 205, 210]
}
index = pd.DatetimeIndex(['2018-04-02', '2019-04-02', '2019-12-31', '2020-04-07', '2020-12-31'])
df = pd.DataFrame(data, index)
df.index.name = 'date'
# resample to annual frequency selecting the values at the end of each year
# note the default NaN when no data for end of year 2018
df_annual = df.asfreq('A')
>>> df_annual
ticker open close
date
2018-12-31 NaN NaN NaN
2019-12-31 AAPL 110.0 120.0
2020-12-31 AAPL 205.0 210.0
# resample to annual frequency selecting the values at the end of each year
# backfill missing with previous non null value
df_annual = df.asfreq('A', method='ffill')
>>> df_annual
ticker open close
date
2018-12-31 AAPL 99 99
2019-12-31 AAPL 110 120
2020-12-31 AAPL 205 210
Add missing business dates in a time series, backfill them with NaN
# giving DataFrame df with DatetimeIndex
data = {'ticker': ['AAPL', 'AAPL', 'AAPL', 'AAPL'],
'open': [100, 110, 200, 205],
'close': [90, 120, 205, 210]
}
index = ['2020-04-02', '2020-04-03', '2020-04-07', '2020-04-08']
df = pd.DataFrame(data, index)
df.index.name = 'date'
>>> df
ticker open close
date
2020-04-02 AAPL 100 90
2020-04-03 AAPL 110 120
2020-04-07 AAPL 200 205
2020-04-08 AAPL 205 210
>>> df.index
DatetimeIndex(['2020-04-02', '2020-04-03', '2020-04-07', '2020-04-08'], dtype='datetime64[ns]', name='date', freq=None)
# add all missing business dates between begin/end dates (date 2020-04-06)
df_all_weekdays = pd.date_range(start=df.index.min(), end=df.index.max(), freq='B')
df = df.reindex(df_all_weekdays)
df.index.name = 'date'
df
>>> df
ticker open close
date
2020-04-02 AAPL 100.0 90.0
2020-04-03 AAPL 110.0 120.0
2020-04-06 NaN NaN NaN
2020-04-07 AAPL 200.0 205.0
2020-04-08 AAPL 205.0 210.0
Add missing business dates in a time series, backfill them with previous row’s data
# giving DataFrame df with DatetimeIndex
>>> df
ticker open close
date
2020-04-02 AAPL 100 90
2020-04-03 AAPL 110 120
2020-04-07 AAPL 200 205
2020-04-08 AAPL 205 210
>>> df.index
DatetimeIndex(['2020-04-02', '2020-04-03', '2020-04-07', '2020-04-08'], dtype='datetime64[ns]', name='date', freq=None)
# add all missing business dates between begin/end dates (date 2020-04-06)
df_all_weekdays = pd.date_range(start=df.index.min(), end=df.index.max(), freq='B')
df = df.reindex(df_all_weekdays, method='ffill')
df.index.name = 'date'
df
>>> df
ticker open close
date
2020-04-02 AAPL 100 90
2020-04-03 AAPL 110 120
2020-04-06 AAPL 110 120
2020-04-07 AAPL 200 205
2020-04-08 AAPL 205 210
Add missing business dates in a time series, backfill them with next row’s data
# giving DataFrame df with DatetimeIndex
>>> df
ticker open close
date
2020-04-02 AAPL 100 90
2020-04-03 AAPL 110 120
2020-04-07 AAPL 200 205
2020-04-08 AAPL 205 210
>>> df.index
DatetimeIndex(['2020-04-02', '2020-04-03', '2020-04-07', '2020-04-08'], dtype='datetime64[ns]', name='date', freq=None)
# add all missing business dates between begin/end dates (date 2020-04-06)
df_all_weekdays = pd.date_range(start=df.index.min(), end=df.index.max(), freq='B')
df = df.reindex(df_all_weekdays, method='bfill')
df.index.name = 'date'
df
>>> df
ticker open close
date
2020-04-02 AAPL 100 90
2020-04-03 AAPL 110 120
2020-04-06 AAPL 200 205
2020-04-07 AAPL 200 205
2020-04-08 AAPL 205 210
Find the 3rd Friday of the month i.e 2022-03
# useful in finance/trading to compute futures or options expiration dates
from datetime import datetime
# generate one month of calendar dates
one_month = pd.date_range(start=datetime(2022, 3, 1), end=datetime(2022, 3, 28), freq='D')
# keep only Fridays of the month
# fridays[2] will be third Friday
fridays = []
for dt in one_month:
if dt.weekday() == 4:
# found a Friday
fridays.append(dt)
continue
third_friday = fridays[2]
>>> third_friday.strftime('%A %Y-%m-%d')
'Friday 2022-03-18'