../_images/search-banner2.png

Time Series Data

pandas contains extensive capabilities and features for working with time series data for all domains. NumPy’s dtypes datetime64 and timedelta64 are used for manipulating time series data.

Create datetime object with python’s native method

from datetime import datetime
date = datetime(year=2010, month=7, day=10, hour=10, minute=50)

>>> date
datetime.datetime(2010, 7, 10, 10, 50)

Create datetime object with python’s native method

from dateutil import parser
date = parser.parse("10th of July, 2020 10:50")

>>> date
datetime.datetime(2020, 7, 10, 10, 50)

Extract date, time parts from datetime

from datetime import datetime
date = datetime(year=2010, month=7, day=10, hour=10, minute=50)

# format can be any code of strftime directive: http://strftime.org/
>>> date.strftime('%A')
'Saturday'

>>> date.strftime('%B')
'July'

>>> date.strftime('%H:%M')
'10:50'

Create datetime object with numpy (datetime64)

import numpy as np
date1 = np.array('2020-07-10', dtype=np.datetime64)
date2 = np.datetime64('2020-07-10 10:50')

>>> date1
array('2020-07-10', dtype='datetime64[D]')

>>> date2
numpy.datetime64('2020-07-10T10:50')

Create datetime object with pandas (Timestamp)

import pandas as pd
date = pd.to_datetime("10th of July, 2020 10:50")

# returns Timestamp object b/c we passed a single date to parse
>>> date
Timestamp('2020-07-10 10:50:00')

Create datetime object with pandas (datetime64)

import datetime as datetime
dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'), datetime.datetime(2018, 1, 1)])

# returns DatetimeIndex object b/c we passed multiple dates to parse
>>> dti
DatetimeIndex(['2018-01-01', '2018-01-01', '2018-01-01'], dtype='datetime64[ns]', freq=None)

Create DataFrame with DatetimeIndex as index

data = {'ticker': ['AAPL', 'AAPL', 'AAPL', 'AAPL'],
        'open': [100, 110, 200, 205],
        'close': [90, 120, 205, 210]
       }
index = pd.DatetimeIndex(['2019-03-02', '2019-04-03', '2020-04-07', '2020-04-08'])
df = pd.DataFrame(data, index)
df.index.name = 'date'

>>> df
           ticker  open  close
date
2019-03-02   AAPL   100     90
2019-04-03   AAPL   110    120
2020-04-07   AAPL   200    205
2020-04-08   AAPL   205    210

>>> df.index
DatetimeIndex(['2019-03-02', '2019-04-03', '2020-04-07', '2020-04-08'], dtype='datetime64[ns]', name='date', freq=None)

Slice DataFrame with DatetimeIndex on specific year/month/period

data = {'ticker': ['AAPL', 'AAPL', 'AAPL', 'AAPL'],
        'open': [100, 110, 200, 205],
        'close': [90, 120, 205, 210]
       }
index = pd.DatetimeIndex(['2019-03-02', '2019-04-03', '2020-04-07', '2020-04-08'])
df = pd.DataFrame(data, index)
df.index.name = 'date'

>>> df
           ticker  open  close
date
2019-03-02   AAPL   100     90
2019-04-03   AAPL   110    120
2020-04-07   AAPL   200    205
2020-04-08   AAPL   205    210

# 2019 rows only
>>> df['2019']
           ticker  open  close
date
2019-03-02   AAPL   100     90
2019-04-03   AAPL   110    120

# 2019 April rows only
>>> df['2019-04']
           ticker  open  close
date
2019-04-03   AAPL   110    120

# select rows for a specific time period
>>> df['2019-04-03':'2020-04-07']
           ticker  open  close
date
2019-04-03   AAPL   110    120
2020-04-07   AAPL   200    205

Resample time series with aggregation

data = {'ticker': ['AAPL', 'AAPL', 'AAPL', 'AAPL'],
        'open': [100, 110, 200, 205],
        'close': [90, 120, 205, 210]
       }
index = pd.DatetimeIndex(['2019-04-02', '2019-12-31', '2020-04-07', '2020-12-31'])
df = pd.DataFrame(data, index)
df.index.name = 'date'

# resample to annual frequency averaging values for each year
df_annual = df.resample('A').mean()

>>> df_annual
             open  close
date
2019-12-31  105.0  105.0
2020-12-31  202.5  207.5

Resample time series with selection

data = {'ticker': ['AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL'],
        'open': [99, 100, 110, 200, 205],
        'close': [99, 90, 120, 205, 210]
       }
index = pd.DatetimeIndex(['2018-04-02', '2019-04-02', '2019-12-31', '2020-04-07', '2020-12-31'])
df = pd.DataFrame(data, index)
df.index.name = 'date'

# resample to annual frequency selecting the values at the end of each year
# note the default NaN when no data for end of year 2018
df_annual = df.asfreq('A')

>>> df_annual
           ticker   open  close
date
2018-12-31    NaN    NaN    NaN
2019-12-31   AAPL  110.0  120.0
2020-12-31   AAPL  205.0  210.0

# resample to annual frequency selecting the values at the end of each year
# backfill missing with previous non null value
df_annual = df.asfreq('A', method='ffill')

>>> df_annual
           ticker  open  close
date
2018-12-31   AAPL    99     99
2019-12-31   AAPL   110    120
2020-12-31   AAPL   205    210

Add missing business dates in a time series, backfill them with NaN

# giving DataFrame df with DatetimeIndex
data = {'ticker': ['AAPL', 'AAPL', 'AAPL', 'AAPL'],
        'open': [100, 110, 200, 205],
        'close': [90, 120, 205, 210]
       }
index = ['2020-04-02', '2020-04-03', '2020-04-07', '2020-04-08']
df = pd.DataFrame(data, index)
df.index.name = 'date'

>>> df
           ticker  open  close
date
2020-04-02   AAPL   100     90
2020-04-03   AAPL   110    120
2020-04-07   AAPL   200    205
2020-04-08   AAPL   205    210
>>> df.index
DatetimeIndex(['2020-04-02', '2020-04-03', '2020-04-07', '2020-04-08'], dtype='datetime64[ns]', name='date', freq=None)

# add all missing business dates between begin/end dates (date 2020-04-06)
df_all_weekdays = pd.date_range(start=df.index.min(), end=df.index.max(), freq='B')
df = df.reindex(df_all_weekdays)
df.index.name = 'date'
df

>>> df
           ticker   open  close
date
2020-04-02   AAPL  100.0   90.0
2020-04-03   AAPL  110.0  120.0
2020-04-06    NaN    NaN    NaN
2020-04-07   AAPL  200.0  205.0
2020-04-08   AAPL  205.0  210.0

Add missing business dates in a time series, backfill them with previous row’s data

# giving DataFrame df with DatetimeIndex
>>> df
           ticker  open  close
date
2020-04-02   AAPL   100     90
2020-04-03   AAPL   110    120
2020-04-07   AAPL   200    205
2020-04-08   AAPL   205    210
>>> df.index
DatetimeIndex(['2020-04-02', '2020-04-03', '2020-04-07', '2020-04-08'], dtype='datetime64[ns]', name='date', freq=None)

# add all missing business dates between begin/end dates (date 2020-04-06)
df_all_weekdays = pd.date_range(start=df.index.min(), end=df.index.max(), freq='B')
df = df.reindex(df_all_weekdays, method='ffill')
df.index.name = 'date'
df

>>> df
           ticker  open  close
date
2020-04-02   AAPL   100     90
2020-04-03   AAPL   110    120
2020-04-06   AAPL   110    120
2020-04-07   AAPL   200    205
2020-04-08   AAPL   205    210

Add missing business dates in a time series, backfill them with next row’s data

# giving DataFrame df with DatetimeIndex
>>> df
           ticker  open  close
date
2020-04-02   AAPL   100     90
2020-04-03   AAPL   110    120
2020-04-07   AAPL   200    205
2020-04-08   AAPL   205    210
>>> df.index
DatetimeIndex(['2020-04-02', '2020-04-03', '2020-04-07', '2020-04-08'], dtype='datetime64[ns]', name='date', freq=None)

# add all missing business dates between begin/end dates (date 2020-04-06)
df_all_weekdays = pd.date_range(start=df.index.min(), end=df.index.max(), freq='B')
df = df.reindex(df_all_weekdays, method='bfill')
df.index.name = 'date'
df

>>> df
           ticker  open  close
date
2020-04-02   AAPL   100     90
2020-04-03   AAPL   110    120
2020-04-06   AAPL   200    205
2020-04-07   AAPL   200    205
2020-04-08   AAPL   205    210