library(lubridate)
library(zoo)
library(reshape2)
source("common/functions.r",chdir=TRUE)
8 Time series
Almost all the data we use here is associated with a particular point in time, like the price of a stock on a given day. That is called a time series. However, it isn’t easy to work with time series as one has to keep track of the day, month and year and know about leap years, time zones, holidays and other market closures. Consequently, it is best to work with data as if it were not a time series and only turn it into a time series when needed, typically for plotting, reporting and aggregation.
Most financial applications involve working with dates. These can be monthly, weekly, daily, or even intraday data. Storing data as text is not helpful since we cannot easily order or subset it.
R has a specific data type called `Date`. In this section, we will explore some packages that help us work with `Date` objects.
8.1 Libraries
8.2 Loading data
# Load every data set with a single call, then unpack the returned list into
# the working variables used throughout this chapter.
data=ProcessRawData()
sp500=data$sp500
sp500tr=data$sp500tr
Price=data$Price
Return=data$Return
UnAdjustedPrice=data$UnAdjustedPrice
Ticker=data$Ticker
8.3 Plotting time series
Start by plotting the SP-500 with the improvements from Section 7.3.2:
# Line plot of the SP-500 price against the observation number
# (no dates yet, so the x-axis is simply the row index).
par(mar = c(3, 4.2, 1, 0))
plot(
  sp500$price,
  main = "The SP-500 index",
  xlab = "day",
  ylab = 'price',
  type = 'l',
  col = 'blue',
  lwd = 2,
  las = 1,
  bty = 'l'
)
8.4 lubridate
We use the `lubridate` package to convert numbers and strings into dates. `ymd()` stands for year-month-day. If we have data with the American date convention, we can use `mdy()`, and in some cases we have `dmy()` formatted dates.
`lubridate` handles months written both as string labels, such as `JAN`, and as integers, such as `01`.
ymd(20200110)
[1] "2020-01-10"
ymd("20200110")
[1] "2020-01-10"
class(ymd("20200110"))
[1] "Date"
ymd("2015JAN11")
[1] "2015-01-11"
class(ymd("2015JAN11"))
[1] "Date"
ymd("04-MAR-5")
[1] "2004-03-05"
class(ymd("04-MAR-5"))
[1] "Date"
dmy("1/june/2019")
[1] "2019-06-01"
class(dmy("1/june/2019"))
[1] "Date"
dmy("28-december-14")
[1] "2014-12-28"
class(dmy("28-december-14"))
[1] "Date"
We can use lubridate
to make a proper date column for the SP-500.
# Parse the numeric yyyymmdd column into a proper Date column, then show the
# last two rows to check the result.
sp500$date.ts=ymd(sp500$date)
tail(sp500,2)
date | price | y | date.ts | |
---|---|---|---|---|
5034 | 20221229 | 3849.28 | 0.0173106 | 2022-12-29 |
5035 | 20221230 | 3839.50 | -0.0025440 | 2022-12-30 |
8.5 Plotting with dates
Now, we can make a time series plot. Note that in `plot(sp500$date.ts, sp500$price, ...)` we use `sp500$date.ts` for the x-axis and `sp500$price` for the y-axis.
# Price against calendar date: dates go on the x-axis, prices on the y-axis.
par(mar = c(2, 4, 1, 0))
plot(
  x = sp500$date.ts,
  y = sp500$price,
  main = "The SP-500 index",
  xlab = "Day",
  ylab = 'Price',
  type = 'l',
  col = 'blue',
  lwd = 2,
  las = 1,
  bty = 'l'
)
You can make it a log plot
# Same date plot, but with a logarithmic y-axis (log = 'y').
par(mar = c(2, 4, 1, 0))
plot(
  x = sp500$date.ts,
  y = sp500$price,
  log = 'y',
  main = "The SP-500 index",
  xlab = "day",
  ylab = 'price',
  type = 'l',
  col = 'blue',
  lwd = 2,
  las = 1,
  bty = 'l'
)
We can customise this a bit more and add sub-tickmarks.
8.6 The zoo
package
What we did above was plot a price vector against a date vector. We can also directly associate dates to prices with the zoo
package, which allows us to work with ordered date-indexed observations. That allows many useful operations.
8.6.1 Make a zoo
# Attach the dates to the returns and prices by building zoo series that are
# ordered by the Date column.
sp500$y.ts = zoo(sp500$y, order.by = sp500$date.ts)
sp500$price.ts = zoo(sp500$price, order.by = sp500$date.ts)
class(sp500$y.ts)
[1] "zoo"
head(sp500$y.ts)
2003-01-03 2003-01-06 2003-01-07 2003-01-08 2003-01-09
-0.0004841496 0.0222255557 -0.0065661110 -0.0141857185 0.0192005899
2003-01-10
0.0000000000
Then, we can plot it directly as a time series.
# A zoo series carries its own dates, so no separate x vector is needed.
plot(
  sp500$y.ts,
  ylab = "Return",
  main = "S&P500 Daily Return"
)
We can do useful things with zoo data.
8.6.2 lag
function
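This function allows us to take the lags or leads of a time series object. The syntax is: `lag(x, k, na.pad = FALSE)`, where:
- x, a time series object to lag
- k, number of observations to shift by; it can be positive or negative. Note the sign convention: a positive k shifts the series earlier in time, so each date receives the value from k observations ahead (a lead, as the `k = 2` example below shows), while a negative k gives the usual backward lag
- na.pad, adds NAs for missing observations if TRUE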
head(sp500$y.ts)
2003-01-03 2003-01-06 2003-01-07 2003-01-08 2003-01-09
-0.0004841496 0.0222255557 -0.0065661110 -0.0141857185 0.0192005899
2003-01-10
0.0000000000
head(lag(sp500$y.ts, k = 2))
2003-01-03 2003-01-06 2003-01-07 2003-01-08 2003-01-09 2003-01-10
-0.006566111 -0.014185718 0.019200590 0.000000000 -0.001413291 0.005812968
8.6.3 diff
function
Takes the lagged difference of a time series. Syntax: diff(x, lag, differences, na.pad = F)
, where:
- x = a time series object
- lag = number of lags(in unit of observations)
- differences = the order of the difference
head(diff(sp500$y.ts, lag = 1, na.pad = TRUE))
2003-01-03 2003-01-06 2003-01-07 2003-01-08 2003-01-09 2003-01-10
NA 0.022709705 -0.028791667 -0.007619607 0.033386308 -0.019200590
8.6.4 The window
function
We can use the window()
function to subset a zoo object to a given time period. For example, let’s say we are interested in the returns during the Covid-19 crisis:
par(mar=c(2,4,1,0))
# Keep only the returns between 1 February 2020 and 1 April 2020.
sub_y.ts = window(sp500$y.ts, start = ymd("20200201"), end = ymd("20200401"))
plot(sub_y.ts,
  main = "Returns in Covid",
  xlab = "Date",
  ylab = "Returns",
  col = "mediumblue",
  lwd=2
)
8.6.5 Aggregate
We often need to aggregate time series data. For example, we may want to calculate end-of-month prices or realised monthly variance. The aggregate
function makes that easy.
# End-of-month price: group by year-month and keep the last observation.
p.monthly=aggregate(sp500$price.ts,as.yearmon,tail,1)
head(p.monthly,5)
Jan 2003 Feb 2003 Mar 2003 Apr 2003 May 2003
855.70 841.15 848.18 916.92 963.59
# Standard deviation of the daily returns within each month.
# NOTE: despite the name, realized.variance holds sd(), i.e. a volatility,
# not a variance.
realized.variance=aggregate(sp500$y.ts,as.yearmon,sd)
head(realized.variance,5)
Jan 2003 Feb 2003 Mar 2003 Apr 2003 May 2003
0.01435287 0.01188719 0.01747653 0.01169788 0.01026536
# Monthly mean of the daily returns, to be compared with realized.variance.
p.monthly.mean=aggregate(sp500$y.ts,as.yearmon,mean)
par(mar=c(4,4,1,0.6))
# Scatter plot with both default axes suppressed (xaxt/yaxt = 'n') so they can
# be redrawn below with percentage labels.
plot(p.monthly.mean,realized.variance,
  bty='l',
  main="SP-500 monthly mean and volatility",
  col='red',
  pch=16,
  xlab="mean",
  ylab="volatility",
  xaxt='n',
  yaxt='n'
)
# x-axis: tick positions from pretty(), labelled in percent.
w=pretty(p.monthly.mean)
axis(1,w,labels=paste0(100*w,"%"))
# y-axis: same treatment, with horizontal labels (las=1).
w=pretty(realized.variance)
axis(2,w,labels=paste0(100*w,"%"),las=1)
# Least-squares line of volatility on mean return, drawn over the scatter.
regression_line=lm(realized.variance ~ p.monthly.mean)
abline(regression_line,col='green',lwd=3)
8.7 Multivariate plots
We use the `matplot` command for many assets. The vector of asset names is called `Ticker`:
# Default matplot of the ticker columns, with no styling applied.
par(mar = c(2, 4, 0, 0))
matplot(Price[, Ticker])
This is quite ugly and can be made to look better.
# Same data drawn as solid lines (type = 'l', lty = 1) with a y-axis label.
par(mar = c(2, 4, 0, 0))
matplot(
  Price[, Ticker],
  ylab = 'Price',
  lty = 1,
  type = 'l'
)
We can add a date to it the same way we did before.
# Add a Date column to each of the three wide data frames.
Price$date.ts=ymd(Price$date)
Return$date.ts=ymd(Return$date)
UnAdjustedPrice$date.ts=ymd(UnAdjustedPrice$date)
par(mar=c(2,4,0,0))
# Dates on the x-axis, one line per ticker on the y-axis.
matplot(
  Price$date.ts,
  Price[,Ticker],
  type='l',
  lty=1,
  ylab='Price'
)
We can put a legend on the plot.
par(mar=c(2,4,0,0))
matplot(
  Price$date.ts,
  Price[,Ticker],
  type='l',
  lty=1,
  ylab='Price',
  col=1:6,
  las=1
)
# The legend uses the same colours (col=1:6) as the lines, laid out in two columns.
legend("topleft",legend=Ticker,lty=1,col=1:6,bty='n',ncol=2)
In order to compare the performance of the stocks, we can re-normalise them to start at 1
par(mar=c(2,4,0,0))
# Work on a copy so that Price itself stays untouched.
pn=Price
# Divide each ticker's series by its first value so every series starts at 1.
for(i in Ticker){
  pn[[i]]=pn[[i]]/pn[[i]][1]
}
matplot(
  pn$date.ts,
  pn[,Ticker],
  type='l',
  lty=1,
  ylab='Price',
  col=1:6,
  las=1
)
legend("topleft",legend=Ticker,lty=1,col=1:6,bty='n',ncol=2)
# First two rows and the last row of the normalised data.
rbind(head(pn,2),tail(pn,1))
date | AAPL | DIS | GE | INTC | JPM | MCD | date.ts | |
---|---|---|---|---|---|---|---|---|
2 | 20030103 | 1.0000 | 1.000000 | 1.0000000 | 1.000000 | 1.000000 | 1.000000 | 2003-01-03 |
3 | 20030106 | 1.0000 | 1.051263 | 1.0255903 | 1.038697 | 1.078639 | 1.032873 | 2003-01-06 |
5035 | 20221230 | 572.7678 | 6.279998 | 0.7411147 | 2.692354 | 8.998689 | 27.909514 | 2022-12-30 |
Log scaling the y-axis can be more informative.
par(mar=c(2,4,0.5,0))
# Work on a copy so that Price itself stays untouched.
pn=Price
# Divide each ticker's series by its first value so every series starts at 1.
for(i in Ticker){
  pn[[i]]=pn[[i]]/pn[[i]][1]
}
# log='y' puts the normalised prices on a logarithmic y-axis.
matplot(
  pn$date.ts,
  pn[,Ticker],
  type='l',
  lty=1,
  ylab='Price',
  col=1:6,
  las=1,
  log='y'
)
legend("topleft",legend=Ticker,lty=1,col=1:6,bty='n',ncol=2)
And put gridlines on it.
par(mar=c(2,4,0.5,0))
# Work on a copy so that Price itself stays untouched.
pn=Price
# Divide each ticker's series by its first value so every series starts at 1.
for(i in Ticker){
  pn[[i]]=pn[[i]]/pn[[i]][1]
}
matplot(
  pn$date.ts,
  pn[,Ticker],
  type='l',
  lty=1,
  ylab='Price',
  col=1:6,
  las=1,
  log='y'
)
# Horizontal grid lines at fixed levels; abline(h=) spans the whole plot
# region, so no explicit start and end dates are needed.
abline(h=c(0.5,1,5,10,50,100,500,1000),col="lightgray")
legend("topleft",legend=Ticker,lty=1,col=1:6,bty='n',ncol=2)
Based on this, the best-performing stock is AAPL.
8.8 ProcessRawData()
Even better is to put all variables into the same list and load them all with `ProcessRawData()`. We put that into `functions.r`.
# ProcessRawData: read the raw CSV files and return all data sets in one list.
#
# Reads 'data/sp500.csv', 'data/sp500tr.csv' and 'data/stocks.csv' (paths are
# relative to the working directory). Needs lubridate (ymd), zoo (zoo) and
# reshape2 (dcast) to be attached.
#
# Returns a list with the elements:
#   Return          - log returns, one column per ticker (first column is date)
#   Price           - wide table built from the column renamed "price"
#   UnAdjustedPrice - wide table built from the column renamed "UnAdjustedPrice"
#   sp500, sp500tr  - data frames with price, log return y, Date column
#                     date.ts and zoo series y.ts
#   Ticker          - unique values of stocks$ticker
ProcessRawData=function(){
  # SP-500: the second column is renamed "price"; y is the log return.
  sp500=read.csv('data/sp500.csv')
  names(sp500)[2]="price"
  sp500$y=c(NA,diff(log(sp500$price)))
  sp500=sp500[!is.na(sp500$y),]  # drop rows without a return (at least the first)
  sp500$date.ts=ymd(sp500$date)
  sp500$y.ts=zoo(sp500$y,order.by=sp500$date.ts)

  # sp500tr gets the same columns.
  # NOTE(review): unlike sp500, rows with an NA return are kept here, so the
  # first row has y = NA. Confirm whether this difference is intended.
  sp500tr=read.csv('data/sp500tr.csv')
  names(sp500tr)[2]="price"
  sp500tr$y=c(NA,diff(log(sp500tr$price)))
  sp500tr$date.ts=ymd(sp500tr$date)
  sp500tr$y.ts=zoo(sp500tr$y,order.by=sp500tr$date.ts)

  # Individual stocks: reshape from long (one row per date and ticker) to wide
  # (one row per date, one column per ticker).
  stocks=read.csv('data/stocks.csv')
  names(stocks)[3:4]=c("UnAdjustedPrice","price")
  Price=dcast(stocks, date ~ ticker, value.var = "price")
  UnAdjustedPrice=dcast(stocks, date ~ ticker, value.var = "UnAdjustedPrice")

  # Log returns for every ticker column; column 1 is the date and is skipped.
  Return=Price
  for (i in seq_len(ncol(Price))[-1]) Return[,i]=c(NA,diff(log(Price[,i])))

  # Keep the same rows in all three tables: those where the first ticker has a
  # return. The mask is computed once, before any table is filtered.
  keep=!is.na(Return[,2])
  Price=Price[keep,]
  UnAdjustedPrice=UnAdjustedPrice[keep,]
  Return=Return[keep,]

  data=list(
    Return=Return,
    Price=Price,
    UnAdjustedPrice=UnAdjustedPrice,
    sp500=sp500,
    sp500tr=sp500tr,
    Ticker=unique(stocks$ticker)
  )
  return(data)
}