15  Backtesting

15.1 Libraries

source('common/functions.r')
suppressPackageStartupMessages(library(rugarch))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(zoo))
library(parallel)

15.2 Data

load('data/data.ts.RData')
sp500=data.ts$sp500
Ticker=data.ts$Ticker
names(data)
names(sp500)
NULL
  1. 'date'
  2. 'price'
  3. 'y'
  4. 'date.ts'
  5. 'y.ts'
  6. 'price.ts'

15.3 Setup

Specify the VaR probability, portfolio value, estimation window size and portfolio weight:

p <- 0.01
value = 1000
WE = 2000
T=length(sp500$y)
WT=T-WE
w = rep(1/length(Ticker),length(Ticker))
cat("p=",p,", value=",value,", WE=",WE,"T=",T,"WT=",WT,", w:",w,"\n")
p= 0.01 , value= 1000 , WE= 2000 T= 5034 WT= 3034 , w: 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 

15.4 Forecast VaR

15.5 Historical simulation — HS

DEBUG=FALSE

violations=rep(NA,length=T)
VaR=rep(NA,length=T)
for(i in (WE+1):T){
    t1=i-WE
    t2=i-1
    data=sp500$y[t1:t2]
    if(DEBUG) cat(i,t1,t2,length(data),"\n")
    VaR[i]=Risk.HS(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR

}
plot(VaR,type='s')

If we want to do this for several methods it is better to do is all in the same loop as that minimises the chance of mistakes. This will take a long time, and we print the estimation time at the end.

DEBUG=FALSE
START=WE+1

time1=Sys.time()
sp500$VaR.HS=rep(NA,length=T)
sp500$VaR.HS500=rep(NA,length=T)

sp500$VaR.EWMA=rep(NA,length=T)
sp500$VaR.GARCH=rep(NA,length=T)
sp500$VaR.tGARCH=rep(NA,length=T)

for(i in START:T){
    t1=i-WE
    t2=i-1
    data=sp500$y[t1:t2]
    if(DEBUG) cat(i,t1,t2,"/",T,length(data),"\n")
    sp500$VaR.HS[i]=Risk.HS(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR
    sp500$VaR.HS500[i]=Risk.HS(y=data,WE=500,p=p,value=value,Print=FALSE)$risk$VaR
    sp500$VaR.EWMA[i]=Risk.EWMA(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR
    sp500$VaR.GARCH[i]=Risk.GARCH(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR
    sp500$VaR.tGARCH[i]=Risk.tGARCH(y=data,p=p,WE=WE,value=value,Print=FALSE)$risk$VaR
}
time2=Sys.time()

This will take quite some time to calculate. We could speed it up in several ways. Use multi-core calculations, and perhaps use our own algo, and feed into it as starting values the result from the previous days optimisation.

print(time2-time1)
Time difference of 7.230909 mins

At the end of this Chapter we show how multi-core calculations can be used to make it much faster.

15.5.1 Presenting results

Some columns in sp500 have VaR in the name. We want to plot those, and can manually put those into the plot command. But it is safer to search for those columns and plot them

print(head(sp500,2))
print(names(sp500))
VaR=grep("VaR",names(sp500))
print(VaR)
print(tail(sp500[,VaR],2))
      date  price             y    date.ts          y.ts price.ts VaR.HS
2 20030103 908.59 -0.0004841496 2003-01-03 -0.0004841496   908.59     NA
3 20030106 929.01  0.0222255557 2003-01-06  0.0222255557   929.01     NA
  VaR.HS500 VaR.EWMA VaR.GARCH VaR.tGARCH
2        NA       NA        NA         NA
3        NA       NA        NA         NA
 [1] "date"       "price"      "y"          "date.ts"    "y.ts"      
 [6] "price.ts"   "VaR.HS"     "VaR.HS500"  "VaR.EWMA"   "VaR.GARCH" 
[11] "VaR.tGARCH"
[1]  7  8  9 10 11
     VaR.HS VaR.HS500 VaR.EWMA VaR.GARCH VaR.tGARCH
5034 35.758    53.222   30.682    26.042     30.241
5035 35.758    53.222   30.535    29.433     33.376

We can then plot the VaR columns.

print(head(sp500))
plot(sp500$date.ts,sp500$VaR.EWMA)
      date  price             y    date.ts          y.ts price.ts VaR.HS
2 20030103 908.59 -0.0004841496 2003-01-03 -0.0004841496   908.59     NA
3 20030106 929.01  0.0222255557 2003-01-06  0.0222255557   929.01     NA
4 20030107 922.93 -0.0065661110 2003-01-07 -0.0065661110   922.93     NA
5 20030108 909.93 -0.0141857185 2003-01-08 -0.0141857185   909.93     NA
6 20030109 927.57  0.0192005899 2003-01-09  0.0192005899   927.57     NA
7 20030110 927.57  0.0000000000 2003-01-10  0.0000000000   927.57     NA
  VaR.HS500 VaR.EWMA VaR.GARCH VaR.tGARCH
2        NA       NA        NA         NA
3        NA       NA        NA         NA
4        NA       NA        NA         NA
5        NA       NA        NA         NA
6        NA       NA        NA         NA
7        NA       NA        NA         NA

par(mar=c(3,3,1,4),mfrow=c(1,1))


matplot(sp500$date.ts,
    sp500[,VaR],
    lty=1,
    type='l',
    col=c("blue","red"),
    lwd=c(1,2),
    las=1,
    ylab="returns",
    yaxt='n',
    bty='l'
)
w=pretty(sp500$VaR.HS)

axis(2,at=w,label=paste0("$",w),las=1)

violations=rep(NA,length=T)

eta= sp500$y < -(sp500$VaR.HS/value)
eta=eta[!is.na(eta)]
cat(WT,length(eta),sum(eta),sum(eta)/(WT*p),"\n")
plot(sp500$VaR.HS,type='s',bty='l',las=1,col="red",main="HS VaR")
3034 3034 35 1.153593 

We can superimpose the VaR onto the returns

par(mar=c(3,3,1,4))
matplot(
    cbind(sp500$y,-sp500$VaR.HS/value),
    lty=1,
    type='l',
    col=c("black","red"),
    lwd=c(1,2),
    las=1,
    ylab="returns",
    yaxt='n',
    bty='u'
)
w=pretty(sp500$y,n=10)
axis(2,at=w,label=paste0(100*w,"%"),las=1)
w=w[w<0]
axis(4,at=w,label=paste0("$",-value*w),las=1)

15.5.2 Zoom into Covid

We can zoom into the Covid-19 period in the first part of 2020. We filtered to the data by not a number (NA) and also create a column,dot, the days when VaR was violated so we can label the plot with red dots below.

res=sp500
res$VaR = -res$VaR.HS/value
res=res[!is.na(res$VaR),]

res$dot=NA
res$dot[res$y < res$VaR] = res$y[ res$y < res$VaR]

There are two main ways we can do the filter by date, either by converting the data into zoo object or simply directly by using the date range. We follow the second way below, but both would work. Note that the code would be different depending on the way we chose.

The zoo and window way.

res=zoo(res,order.by=res$date.ts)
res=window(res,start=ymd(20200201),end=ymd(20200701))
res=res[res$date.ts>=ymd(20200101) &res$date.ts<=ymd(20200701), ]

We then made the main plot.

par(mar=c(3,4,1,6))
matplot(res$date.ts,res[,c("y","VaR")],
     main="Risk in Covid (HS VaR)",
    lty=1,
    type='l',
    col=c("black","red"),
    lwd=c(1,2),
    las=1,
    ylab="returns",
    yaxt='n'
)

points(res$date.ts,res$dot,pch=16,col="red",cex=1.5)

w=pretty(res$y,n=5)
axis(2,at=w,label=paste0(value*w,"%"),las=1)
w=w[w<0]
axis(4,at=w,label=paste0("$",-value*w),las=1)
mtext("return",2,outer = TRUE)
mtext("VaR",4,outer = FALSE)

15.6 Using many cores for the calculations

The calculation of the backtests took a long time. Since every computer has multiple cores, we can run the calculations on all the cores at the same time. While that does not work of every type of calculation, the backtests are well suited for it.

Start by making a function that does the calculations for each day and returns of vector with the risk forecasts.

tmpfn=function(i){
    t1=i-WE
    t2=i-1
    data=sp500$y[t1:t2]
    res=vector(length=5)
    res[1]=Risk.HS(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR
    res[2]=Risk.HS(y=data,WE=500,p=p,value=value,Print=FALSE)$risk$VaR
    res[3]=Risk.EWMA(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR
    res[4]=Risk.GARCH(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR
    res[5]=Risk.tGARCH(y=data,WE=WE,p=p,value=value,Print=FALSE)$risk$VaR
    return(res)
}
x=tmpfn(2000)

There are several ways we can use to execute multiple calculations in parallel in R. We start by loading the library parallel. The way forward depends whether we are on a Windows machine or a unix computer, like Linux or Mac. If on Windows, we need to use commands like do for, while it is easier to use mclapply under unix.

Just like in the example above, we loop over observations START:T. We also need to set the number of course we what do use for the calculation.

time1=Sys.time()
res=mclapply(START:T, FUN=tmpfn, mc.cores =8)
time2=Sys.time()
print(time2-time1)
Time difference of 1.575405 mins

This takes about 1/6 of the time the single core calculation above took, not 1/8, as there are overheads with multi core calculations.

mclapply returns a list, with one entry per day, each with five risk calculations.

length(res)
class(res)
res[1]
res[[1]]
3034
'list'
    1. 42.593
    2. 69.482
    3. 20.529
    4. 20.513
    5. 22.12
  1. 42.593
  2. 69.482
  3. 20.529
  4. 20.513
  5. 22.12

We can convert the list into a long vector, using unlist, where the first five elements are the first five risk estimates, etc., and then converting that into a matrix with five columns, and one row a day. After converting the matrix into a dataframe, we can then name the columns.

res.ul=unlist(res)
length(res.ul)
length(res[[1]])
res.m=matrix(unlist(res),ncol=length(res[[1]]),byrow=TRUE)
res.df=as.data.frame(res.m)
dim(res.df)
names(res.df)=c("VaR.HS","VaR.HS500","VaR.EWMA","VaR.GARCH","VaR.tGARCH")
head(res.df,2)
15170
5
  1. 3034
  2. 5
A data.frame: 2 × 5
VaR.HS VaR.HS500 VaR.EWMA VaR.GARCH VaR.tGARCH
<dbl> <dbl> <dbl> <dbl> <dbl>
1 42.593 69.482 20.529 20.513 22.120
2 42.593 69.482 20.194 19.821 21.376

The first risk estimate is for day START, and we want to create a data frame with the same number of rows as the data, and we do that by filling it with rows containing only NA.

df = t(data.frame(rep(NA,length(res[[1]])))) 
df=as.data.frame(df[rep(seq_len(nrow(df)), each = START-1), ])
class(df)
names(df)=names(res.df)
res.df.full=rbind(df,res.df)
rownames(res.df.full)=NULL
print(rbind(head(res.df.full,1),tail(res.df.full,1)))
dim(res.df.full)
dim(sp500)
'data.frame'
     VaR.HS VaR.HS500 VaR.EWMA VaR.GARCH VaR.tGARCH
1        NA        NA       NA        NA         NA
5034 35.758    53.222   30.535    29.433     33.376
  1. 5034
  2. 5
  1. 5034
  2. 11

We then want to make a matrix with returns and the risk estimates, one that looks the same as what we did above. In this case, we do that by copying the first six columns of the sp500 into a new dataframe called sp500.parallel, and using cbind to append the risk columns to it.

sp500.parallel=sp500[,c("date","price","y","date.ts","y.ts","price.ts")]
sp500.parallel=cbind(sp500.parallel,res.df.full)
tail(sp500.parallel)
dim(sp500.parallel)
dim(sp500)
A data.frame: 6 × 11
date price y date.ts y.ts price.ts VaR.HS VaR.HS500 VaR.EWMA VaR.GARCH VaR.tGARCH
<int> <dbl> <dbl> <date> <zoo> <zoo> <dbl> <dbl> <dbl> <dbl> <dbl>
5030 20221222 3822.39 -0.014557130 2022-12-22 -0.014557130 3822.39 35.758 53.222 32.131 29.006 33.374
5031 20221223 3844.82 0.005850959 2022-12-23 0.005850959 3844.82 35.758 53.222 32.267 30.275 34.545
5032 20221227 3829.25 -0.004057852 2022-12-27 -0.004057852 3829.25 35.758 53.222 32.365 27.608 32.017
5033 20221228 3783.22 -0.012093463 2022-12-28 -0.012093463 3783.22 35.758 53.222 31.556 25.106 29.559
5034 20221229 3849.28 0.017310619 2022-12-29 0.017310619 3849.28 35.758 53.222 30.682 26.042 30.241
5035 20221230 3839.50 -0.002543968 2022-12-30 -0.002543968 3839.50 35.758 53.222 30.535 29.433 33.376
  1. 5034
  2. 11
  1. 5034
  2. 11

As a final assurance, we test to see if the two dataframes are identical.

identical(sp500,sp500.parallel)
TRUE

15.7 Exercise

Conduct Bernoulli Coverage tests and Independence tests for the above backtesting procedure. Interpret the results.

# Answer
p.mat=matrix(NA,5,WT)
for (i in 1:5){
  tmp=sp500$y.ts < -(sp500[,i+6]/value)
  tmp=tmp[!is.na(tmp)]
  p.mat[i,]=tmp
  #eta[i]=sum(tmp)
}
p.mat=1*p.mat
rowSums(p.mat)

bern_test = function(p,v){
  a=p^v*(1 - p)^(WT-v)
  b = (v/WT)^v*(1 - v/WT)^(WT-v)
  return(-2*log(a/b))
}
bern.rst=bern_test(p,rowSums(p.mat))
bern.rst>qchisq(1-p, df=1)

ind_test = function(V){
  J = matrix(ncol = 4,nrow = length(V))
  for (i in 2:length(V)){
    J[i,1] = V[i - 1] == 0 & V[i] == 0
    J[i,2] = V[i - 1] ==0 & V[i] == 1
    J[i,3] = V[i - 1] == 1 & V[i] == 0
    J[i,4] = V[i - 1] == 1 & V[i] == 1
  }  
  V_00 = sum(J[,1],na.rm = TRUE)
  V_01 = sum(J[,2],na.rm = TRUE)
  V_10 = sum(J[,3],na.rm = TRUE)
  V_11 = sum(J[,4],na.rm = TRUE)
  p_00 = V_00/(V_00 + V_01)
  p_01 = V_01/(V_00 + V_01)
  p_10 = V_10/(V_10 + V_11)
  p_11 = V_11/(V_10 + V_11)
  hat_p = (V_01 + V_11)/(V_00 + V_01 + V_10 + V_11)
  a = (1 - hat_p)^(V_00 + V_10)*(hat_p)^(V_01 + V_11)
  b = (p_00)^(V_00)*(p_01)^(V_01)*(p_10)^(V_10)* p_11^(V_11)
  return(-2 * log(a/b))
}

ind.rst=apply(p.mat,1,ind_test)
ind.rst>qchisq(1-p, df=1)

# Coverage test: reject null (p matches the observed number of violations) for HS500, EWMA, GARCH.
# Independence test: fail to reject null (two violations do not follow each other) for models.
  1. 35
  2. 9
  3. 77
  4. 69
  5. 44
  1. FALSE
  2. TRUE
  3. TRUE
  4. TRUE
  5. FALSE
  1. FALSE
  2. FALSE
  3. FALSE
  4. FALSE
  5. FALSE