source('common/functions.r')
suppressPackageStartupMessages(library(rugarch))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(zoo))
library(parallel)
15 Backtesting
15.1 Libraries
15.2 Data
load('data/data.ts.RData')
=data.ts$sp500
sp500=data.ts$Ticker
Tickernames(data)
names(sp500)
NULL
- 'date'
- 'price'
- 'y'
- 'date.ts'
- 'y.ts'
- 'price.ts'
15.3 Setup
Specify the VaR probability, portfolio value, estimation window size and portfolio weight:
<- 0.01
p = 1000
value = 2000
WE =length(sp500$y)
T=T-WE
WT= rep(1/length(Ticker),length(Ticker))
w cat("p=",p,", value=",value,", WE=",WE,"T=",T,"WT=",WT,", w:",w,"\n")
p= 0.01 , value= 1000 , WE= 2000 T= 5034 WT= 3034 , w: 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
15.4 Forecast VaR
15.5 Historical simulation — HS
=FALSE
DEBUG
=rep(NA,length=T)
violations=rep(NA,length=T)
VaRfor(i in (WE+1):T){
=i-WE
t1=i-1
t2=sp500$y[t1:t2]
dataif(DEBUG) cat(i,t1,t2,length(data),"\n")
=Risk.HS(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR
VaR[i]
}plot(VaR,type='s')
If we want to do this for several methods it is better to do is all in the same loop as that minimises the chance of mistakes. This will take a long time, and we print the estimation time at the end.
=FALSE
DEBUG=WE+1
START
=Sys.time()
time1$VaR.HS=rep(NA,length=T)
sp500$VaR.HS500=rep(NA,length=T)
sp500
$VaR.EWMA=rep(NA,length=T)
sp500$VaR.GARCH=rep(NA,length=T)
sp500$VaR.tGARCH=rep(NA,length=T)
sp500
for(i in START:T){
=i-WE
t1=i-1
t2=sp500$y[t1:t2]
dataif(DEBUG) cat(i,t1,t2,"/",T,length(data),"\n")
$VaR.HS[i]=Risk.HS(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR
sp500$VaR.HS500[i]=Risk.HS(y=data,WE=500,p=p,value=value,Print=FALSE)$risk$VaR
sp500$VaR.EWMA[i]=Risk.EWMA(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR
sp500$VaR.GARCH[i]=Risk.GARCH(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR
sp500$VaR.tGARCH[i]=Risk.tGARCH(y=data,p=p,WE=WE,value=value,Print=FALSE)$risk$VaR
sp500
}=Sys.time() time2
This will take quite some time to calculate. We could speed it up in several ways. Use multi-core calculations, and perhaps use our own algo, and feed into it as starting values the result from the previous days optimisation.
print(time2-time1)
Time difference of 7.230909 mins
At the end of this Chapter we show how multi-core calculations can be used to make it much faster.
15.5.1 Presenting results
Some columns in sp500
have VaR
in the name. We want to plot those, and can manually put those into the plot command. But it is safer to search for those columns and plot them
print(head(sp500,2))
print(names(sp500))
=grep("VaR",names(sp500))
VaRprint(VaR)
print(tail(sp500[,VaR],2))
date price y date.ts y.ts price.ts VaR.HS
2 20030103 908.59 -0.0004841496 2003-01-03 -0.0004841496 908.59 NA
3 20030106 929.01 0.0222255557 2003-01-06 0.0222255557 929.01 NA
VaR.HS500 VaR.EWMA VaR.GARCH VaR.tGARCH
2 NA NA NA NA
3 NA NA NA NA
[1] "date" "price" "y" "date.ts" "y.ts"
[6] "price.ts" "VaR.HS" "VaR.HS500" "VaR.EWMA" "VaR.GARCH"
[11] "VaR.tGARCH"
[1] 7 8 9 10 11
VaR.HS VaR.HS500 VaR.EWMA VaR.GARCH VaR.tGARCH
5034 35.758 53.222 30.682 26.042 30.241
5035 35.758 53.222 30.535 29.433 33.376
We can then plot the VaR columns.
print(head(sp500))
plot(sp500$date.ts,sp500$VaR.EWMA)
date price y date.ts y.ts price.ts VaR.HS
2 20030103 908.59 -0.0004841496 2003-01-03 -0.0004841496 908.59 NA
3 20030106 929.01 0.0222255557 2003-01-06 0.0222255557 929.01 NA
4 20030107 922.93 -0.0065661110 2003-01-07 -0.0065661110 922.93 NA
5 20030108 909.93 -0.0141857185 2003-01-08 -0.0141857185 909.93 NA
6 20030109 927.57 0.0192005899 2003-01-09 0.0192005899 927.57 NA
7 20030110 927.57 0.0000000000 2003-01-10 0.0000000000 927.57 NA
VaR.HS500 VaR.EWMA VaR.GARCH VaR.tGARCH
2 NA NA NA NA
3 NA NA NA NA
4 NA NA NA NA
5 NA NA NA NA
6 NA NA NA NA
7 NA NA NA NA
par(mar=c(3,3,1,4),mfrow=c(1,1))
matplot(sp500$date.ts,
sp500[,VaR],lty=1,
type='l',
col=c("blue","red"),
lwd=c(1,2),
las=1,
ylab="returns",
yaxt='n',
bty='l'
)=pretty(sp500$VaR.HS)
w
axis(2,at=w,label=paste0("$",w),las=1)
=rep(NA,length=T)
violations
= sp500$y < -(sp500$VaR.HS/value)
eta=eta[!is.na(eta)]
etacat(WT,length(eta),sum(eta),sum(eta)/(WT*p),"\n")
plot(sp500$VaR.HS,type='s',bty='l',las=1,col="red",main="HS VaR")
3034 3034 35 1.153593
We can superimpose the VaR onto the returns
par(mar=c(3,3,1,4))
matplot(
cbind(sp500$y,-sp500$VaR.HS/value),
lty=1,
type='l',
col=c("black","red"),
lwd=c(1,2),
las=1,
ylab="returns",
yaxt='n',
bty='u'
)=pretty(sp500$y,n=10)
waxis(2,at=w,label=paste0(100*w,"%"),las=1)
=w[w<0]
waxis(4,at=w,label=paste0("$",-value*w),las=1)
15.5.2 Zoom into Covid
We can zoom into the Covid-19 period in the first part of 2020. We filtered to the data by not a number (NA
) and also create a column,dot
, the days when VaR was violated so we can label the plot with red dots below.
=sp500
res$VaR = -res$VaR.HS/value
res=res[!is.na(res$VaR),]
res
$dot=NA
res$dot[res$y < res$VaR] = res$y[ res$y < res$VaR] res
There are two main ways we can do the filter by date, either by converting the data into zoo
object or simply directly by using the date range. We follow the second way below, but both would work. Note that the code would be different depending on the way we chose.
The zoo
and window
way.
res=zoo(res,order.by=res$date.ts)
res=window(res,start=ymd(20200201),end=ymd(20200701))
=res[res$date.ts>=ymd(20200101) &res$date.ts<=ymd(20200701), ] res
We then made the main plot.
par(mar=c(3,4,1,6))
matplot(res$date.ts,res[,c("y","VaR")],
main="Risk in Covid (HS VaR)",
lty=1,
type='l',
col=c("black","red"),
lwd=c(1,2),
las=1,
ylab="returns",
yaxt='n'
)
points(res$date.ts,res$dot,pch=16,col="red",cex=1.5)
=pretty(res$y,n=5)
waxis(2,at=w,label=paste0(value*w,"%"),las=1)
=w[w<0]
waxis(4,at=w,label=paste0("$",-value*w),las=1)
mtext("return",2,outer = TRUE)
mtext("VaR",4,outer = FALSE)
15.6 Using many cores for the calculations
The calculation of the backtests took a long time. Since every computer has multiple cores, we can run the calculations on all the cores at the same time. While that does not work of every type of calculation, the backtests are well suited for it.
Start by making a function that does the calculations for each day and returns of vector with the risk forecasts.
=function(i){
tmpfn=i-WE
t1=i-1
t2=sp500$y[t1:t2]
data=vector(length=5)
res1]=Risk.HS(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR
res[2]=Risk.HS(y=data,WE=500,p=p,value=value,Print=FALSE)$risk$VaR
res[3]=Risk.EWMA(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR
res[4]=Risk.GARCH(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR
res[5]=Risk.tGARCH(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR
res[return(res)
}=tmpfn(2000) x
There are several ways we can use to execute multiple calculations in parallel in R. We start by loading the library parallel
. The way forward depends whether we are on a Windows machine or a unix computer, like Linux or Mac. If on Windows, we need to use commands like do for
, while it is easier to use mclapply
under unix.
Just like in the example above, we loop over observations START:T
. We also need to set the number of course we what do use for the calculation.
=Sys.time()
time1=mclapply(START:T, FUN=tmpfn, mc.cores =8)
res=Sys.time()
time2print(time2-time1)
Time difference of 1.575405 mins
This takes about 1/6 of the time the single core calculation above took, not 1/8, as there are overheads with multi core calculations.
mclapply
returns a list, with one entry per day, each with five risk calculations.
length(res)
class(res)
1]
res[1]] res[[
-
- 42.593
- 69.482
- 20.529
- 20.513
- 22.12
- 42.593
- 69.482
- 20.529
- 20.513
- 22.12
We can convert the list into a long vector, using unlist
, where the first five elements are the first five risk estimates, etc., and then converting that into a matrix with five columns, and one row a day. After converting the matrix into a dataframe, we can then name the columns.
=unlist(res)
res.ullength(res.ul)
length(res[[1]])
=matrix(unlist(res),ncol=length(res[[1]]),byrow=TRUE)
res.m=as.data.frame(res.m)
res.dfdim(res.df)
names(res.df)=c("VaR.HS","VaR.HS500","VaR.EWMA","VaR.GARCH","VaR.tGARCH")
head(res.df,2)
- 3034
- 5
VaR.HS | VaR.HS500 | VaR.EWMA | VaR.GARCH | VaR.tGARCH | |
---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
1 | 42.593 | 69.482 | 20.529 | 20.513 | 22.120 |
2 | 42.593 | 69.482 | 20.194 | 19.821 | 21.376 |
The first risk estimate is for day START
, and we want to create a data frame with the same number of rows as the data, and we do that by filling it with rows containing only NA
.
= t(data.frame(rep(NA,length(res[[1]]))))
df =as.data.frame(df[rep(seq_len(nrow(df)), each = START-1), ])
dfclass(df)
names(df)=names(res.df)
=rbind(df,res.df)
res.df.fullrownames(res.df.full)=NULL
print(rbind(head(res.df.full,1),tail(res.df.full,1)))
dim(res.df.full)
dim(sp500)
VaR.HS VaR.HS500 VaR.EWMA VaR.GARCH VaR.tGARCH
1 NA NA NA NA NA
5034 35.758 53.222 30.535 29.433 33.376
- 5034
- 5
- 5034
- 11
We then want to make a matrix with returns and the risk estimates, one that looks the same as what we did above. In this case, we do that by copying the first six columns of the sp500
into a new dataframe called sp500.parallel
, and using cbind
to append the risk columns to it.
=sp500[,c("date","price","y","date.ts","y.ts","price.ts")]
sp500.parallel=cbind(sp500.parallel,res.df.full)
sp500.paralleltail(sp500.parallel)
dim(sp500.parallel)
dim(sp500)
date | price | y | date.ts | y.ts | price.ts | VaR.HS | VaR.HS500 | VaR.EWMA | VaR.GARCH | VaR.tGARCH | |
---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <dbl> | <dbl> | <date> | <zoo> | <zoo> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
5030 | 20221222 | 3822.39 | -0.014557130 | 2022-12-22 | -0.014557130 | 3822.39 | 35.758 | 53.222 | 32.131 | 29.006 | 33.374 |
5031 | 20221223 | 3844.82 | 0.005850959 | 2022-12-23 | 0.005850959 | 3844.82 | 35.758 | 53.222 | 32.267 | 30.275 | 34.545 |
5032 | 20221227 | 3829.25 | -0.004057852 | 2022-12-27 | -0.004057852 | 3829.25 | 35.758 | 53.222 | 32.365 | 27.608 | 32.017 |
5033 | 20221228 | 3783.22 | -0.012093463 | 2022-12-28 | -0.012093463 | 3783.22 | 35.758 | 53.222 | 31.556 | 25.106 | 29.559 |
5034 | 20221229 | 3849.28 | 0.017310619 | 2022-12-29 | 0.017310619 | 3849.28 | 35.758 | 53.222 | 30.682 | 26.042 | 30.241 |
5035 | 20221230 | 3839.50 | -0.002543968 | 2022-12-30 | -0.002543968 | 3839.50 | 35.758 | 53.222 | 30.535 | 29.433 | 33.376 |
- 5034
- 11
- 5034
- 11
As a final assurance, we test to see if the two dataframes are identical.
identical(sp500,sp500.parallel)
15.7 Exercise
Conduct Bernoulli Coverage tests and Independence tests for the above backtesting procedure. Interpret the results.
# Answer
=matrix(NA,5,WT)
p.matfor (i in 1:5){
=sp500$y.ts < -(sp500[,i+6]/value)
tmp=tmp[!is.na(tmp)]
tmp=tmp
p.mat[i,]#eta[i]=sum(tmp)
}=1*p.mat
p.matrowSums(p.mat)
= function(p,v){
bern_test =p^v*(1 - p)^(WT-v)
a= (v/WT)^v*(1 - v/WT)^(WT-v)
b return(-2*log(a/b))
}=bern_test(p,rowSums(p.mat))
bern.rst>qchisq(1-p, df=1)
bern.rst
= function(V){
ind_test = matrix(ncol = 4,nrow = length(V))
J for (i in 2:length(V)){
1] = V[i - 1] == 0 & V[i] == 0
J[i,2] = V[i - 1] ==0 & V[i] == 1
J[i,3] = V[i - 1] == 1 & V[i] == 0
J[i,4] = V[i - 1] == 1 & V[i] == 1
J[i,
} = sum(J[,1],na.rm = TRUE)
V_00 = sum(J[,2],na.rm = TRUE)
V_01 = sum(J[,3],na.rm = TRUE)
V_10 = sum(J[,4],na.rm = TRUE)
V_11 = V_00/(V_00 + V_01)
p_00 = V_01/(V_00 + V_01)
p_01 = V_10/(V_10 + V_11)
p_10 = V_11/(V_10 + V_11)
p_11 = (V_01 + V_11)/(V_00 + V_01 + V_10 + V_11)
hat_p = (1 - hat_p)^(V_00 + V_10)*(hat_p)^(V_01 + V_11)
a = (p_00)^(V_00)*(p_01)^(V_01)*(p_10)^(V_10)* p_11^(V_11)
b return(-2 * log(a/b))
}
=apply(p.mat,1,ind_test)
ind.rst>qchisq(1-p, df=1)
ind.rst
# Coverage test: reject null (p matches the observed number of violations) for HS500, EWMA, GARCH.
# Independence test: fail to reject null (two violations do not follow each other) for models.
- 35
- 9
- 77
- 69
- 44
- FALSE
- TRUE
- TRUE
- TRUE
- FALSE
- FALSE
- FALSE
- FALSE
- FALSE
- FALSE