-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathregression.Rmd
More file actions
179 lines (139 loc) · 3.85 KB
/
regression.Rmd
File metadata and controls
179 lines (139 loc) · 3.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
---
title: "Regression"
output: html_document
date: "2024-07-20"
---
```{r}
# imports
# library() stops immediately when a package is missing, whereas require()
# only warns and returns FALSE, deferring the failure to the first call.
library(ggplot2)
library(zoo)
library(reshape2)
library(fitdistrplus)
library(lmtest)
```
## Load and pre-process data
Assumption: the three CSV files were downloaded into `data/`, i.e. these files exist:

- `data/DGS5.csv`
- `data/DGS10.csv`
- `data/T10YIE.csv`
```{r}
# read data, set types, encode missings
data_path <- "data/"

# Read `<data_path><ticker>.csv`, parsing the DATE column as Date and the
# column named after the ticker as numeric. The strings ".", "" and "NA"
# are all treated as missing values.
load_series <- function(ticker) {
  col_types <- c("Date", "numeric")
  names(col_types) <- c("DATE", ticker)
  read.csv(
    file = paste0(data_path, ticker, ".csv"),
    colClasses = col_types,
    na.strings = c(".", "", "NA")
  )
}

inflat <- load_series("T10YIE")
yield5 <- load_series("DGS5")
yield10 <- load_series("DGS10")
```
## explore basic location and dispersion statistics
```{r}
# Per-column summary of the T10YIE data frame (range/quartiles/mean, NA count).
summary(inflat)
```
```{r}
# Per-column summary of the DGS5 data frame (range/quartiles/mean, NA count).
summary(yield5)
```
```{r}
# Per-column summary of the DGS10 data frame (range/quartiles/mean, NA count).
summary(yield10)
```
## fill missing values by carrying the last observation forward
```{r}
# Carry the last observation forward in every series (zoo's na.locf), then
# verify that no NA is left in any of the three data frames.
filled <- lapply(
  list(inflat = inflat, yield5 = yield5, yield10 = yield10),
  na.locf
)
inflat <- filled[["inflat"]]
yield5 <- filled[["yield5"]]
yield10 <- filled[["yield10"]]

stopifnot(
  sum(is.na(inflat)) == 0,
  sum(is.na(yield5)) == 0,
  sum(is.na(yield10)) == 0
)
```
## check whether all have same dates
inflat has one day more, 2024-07-19 (we can omit this date to align date axis)
```{r}
# Compare the DATE columns of every pair of series and print the dates that
# occur in one series but not in the other (in both directions).
df.list <- list(inflat = inflat, yield5 = yield5, yield10 = yield10)
df.pairs <- combn(names(df.list), 2)

# seq_len() is safe even if there are no pairs (1:0 would iterate twice).
for (i in seq_len(ncol(df.pairs))) {
  name1 <- df.pairs[1, i]
  name2 <- df.pairs[2, i]
  df1 <- df.list[[name1]]
  df2 <- df.list[[name2]]
  dates1 <- unique(df1$DATE)
  dates2 <- unique(df2$DATE)

  # Subsetting with %in% keeps the Date class. setdiff() may return bare
  # numbers, and as.Date(<numeric>) without `origin` errors on R < 4.3.
  date_setdiff <- dates1[!dates1 %in% dates2]
  if (length(date_setdiff) > 0) {
    print(paste("dates set diff between", name1, "-", name2, ":", format(date_setdiff)))
  }

  date_setdiff_back <- dates2[!dates2 %in% dates1]
  if (length(date_setdiff_back) > 0) {
    print(paste("dates set diff between", name2, "-", name1, ":", format(date_setdiff_back)))
  }
}
```
## put all in one dataframe to align dates
```{r}
# Inner-join the three series on their shared column(s) by folding merge()
# over the list, then check that no yield5 date was lost and no NA remains.
df <- Reduce(merge, list(inflat, yield5, yield10))
stopifnot(
  length(unique(df$DATE)) == length(unique(yield5$DATE)),
  sum(is.na(df)) == 0
)
```
```{r}
# save dataframe
# save() fails when the target directory is missing, so create it first
# (a no-op if it already exists).
out_file <- "dataframes/reg.Rda"
dir.create(dirname(out_file), showWarnings = FALSE, recursive = TRUE)
save(df, file = out_file)
```
```{r}
# plot all time series at once
# Reshape to long format (one row per DATE/TICKER pair) so that ggplot can
# draw one coloured line per series. `id.vars` is spelled out in full rather
# than relying on partial matching of `id`.
df.melt <- melt(
  df,
  id.vars = "DATE",
  measure.vars = c("T10YIE", "DGS5", "DGS10"),
  variable.name = "TICKER",
  value.name = "RATE"
)
ggplot(df.melt, aes(x = DATE, y = RATE, color = TICKER)) +
  geom_line()
```
## evaluate assumptions of linear regression (a priori sanity checks)
assumption: the response is DGS10 which we want to regress on the explanatory variable T10YIE
linearity: from the plot it is doubtful whether the relationship between x=T10YIE and E[DGS10] is linear
```{r}
# Scatter plot of the response (DGS10) against the regressor (T10YIE) to
# eyeball whether a linear relationship is plausible.
ggplot(data = df, mapping = aes(x = T10YIE, y = DGS10)) +
  geom_point()
```
independence violated: observations of dependent and independent variable are highly autocorrelated
```{r}
# Plot the sample autocorrelation function of the regressor and of the
# response; slowly decaying bars indicate serially dependent observations.
acf(df$T10YIE)
acf(df$DGS10)
```
normality of response: the Shapiro-Wilk test rejects the H0 of normality
```{r}
# Shapiro-Wilk test of normality on the response.
# NOTE: shapiro.test() only accepts between 3 and 5000 non-missing values,
# so this call errors if the merged sample is longer than that.
shapiro.test(df$DGS10) # observed result: p-value < 2.2e-16
```
normal distribution of response:

- deviations from the normal distribution, especially in the tails
- this assumption is not needed for prediction
```{r}
# Fit a normal distribution to the response with fitdistrplus::fitdist
# and draw its diagnostic plots to compare the empirical and fitted
# distributions.
fit.norm <- fitdist(df$DGS10, distr="norm")
plot(fit.norm)
```
## regress 10y yield (y) against inflation (x)
```{r}
# Least-squares fit of DGS10 on T10YIE. The `1 +` term only makes the
# intercept explicit; lm() includes it by default.
fit <- lm(DGS10 ~ 1 + T10YIE, data=df)
# Coefficient table, residual standard error and R-squared.
summary(fit)
```
```{r}
# Scatter of the observations with the fitted regression line on top.
# The line layer inherits x = T10YIE from the base plot and only overrides y
# with the fitted values of `fit`.
scatter <- ggplot(df, aes(x = T10YIE, y = DGS10)) +
  geom_point()
scatter +
  geom_line(aes(y = fit$fitted.values), color = "red")
```
## diagnose residuals
The histogram shows heavy-tailed residuals
```{r}
# Histogram of the regression residuals to inspect their shape and tails.
hist(fit$residuals)
```
Tukey-Anscombe plot:

- assumptions of zero mean and homoscedasticity are violated!
- shape of mean curve could suggest a missing cubic term
  (https://www.londschien.ch/teaching/stat-modelling/2022-11-14)
```{r}
# plot.lm panel 1: residuals against fitted values (Tukey-Anscombe plot).
plot(fit, which=1)
```
Q-Q plot shows deviation from normal in tails
```{r}
# plot.lm panel 2: normal Q-Q plot of the standardized residuals.
plot(fit, which=2)
```