Skip to content

Latest commit

 

History

History
1405 lines (1247 loc) · 61.6 KB

File metadata and controls

1405 lines (1247 loc) · 61.6 KB

edit-5.0

GitHub Documents

РАЗВЕДОЧНЫЙ АНАЛИЗ ДАННЫХ

diplom_1802_imp = read.csv('/Users/olesyamba/Downloads/data/diplom_1802_imp.csv')
diplom_1802_w_imp = read.csv('/Users/olesyamba/Downloads/data/diplom_1802_w_imp.csv')
diplom_1802_imp = filter(diplom_1802_imp, year != "2016")
desc = stat.desc(diplom_1802_imp[, c(6, 7,10:20)])
desc = t(round(desc, digits = 2))
desc = desc[, c('nbr.val','min', 'max', 'mean', 'median', 'std.dev')]
desc
##                         nbr.val      min    max   mean median std.dev
## price_to_book               612     0.04  11.17   3.02   2.67    1.43
## polarity_news               612     0.03   0.14   0.09   0.09    0.01
## polarity_twitter            612     0.01   0.30   0.12   0.12    0.02
## yoy_revenue_growth          612   -11.55 261.18  19.02  16.64   27.85
## google_trends               612 -1021.73 683.00 167.66 164.36  102.03
## number_of_news              612     0.00  32.15   2.15   1.28    3.26
## number_of_likes_twitter     612     0.02 220.08  15.42  14.91   12.68
## number_of_tweets            612     0.28 487.69  76.10  72.89   38.56
## sales                       612  -189.12 103.60  25.30  26.19   12.00
## ROA                         612   -22.57  38.05  14.15  13.96    6.65
## fin_leverage                612  -105.15  43.95   0.56   0.45    6.60
## buyback_yield               612   -24.97  37.22   2.38   2.29    3.92
## log_sales                   607     1.82   4.65   3.25   3.30    0.30

На графике представлены описательные статистики до предварительной обработки, включающей обработку пропущенных значений, обработку выбросов, а также уменьшение дисперсии некоторых переменных с целью борьбы с шумом.

diplom_1802_w_imp = filter(diplom_1802_w_imp, year != "2016")
desc = stat.desc(diplom_1802_w_imp[, c(6, 7,10:20)])
desc = t(round(desc, digits = 2))
desc = desc[, c('nbr.val','min', 'max', 'mean', 'median', 'std.dev')]
desc
##                         nbr.val    min    max   mean median std.dev
## price_to_book               612   0.04  11.17   3.02   2.67    1.43
## polarity_news               612   0.03   0.14   0.09   0.09    0.01
## polarity_twitter            612   0.01   0.30   0.12   0.12    0.02
## yoy_revenue_growth          612 -11.55  47.14  16.73  16.64   12.82
## google_trends               612  10.00 683.00 169.48 164.36   89.90
## number_of_news              612   0.00  32.15   2.20   1.38    3.28
## number_of_likes_twitter     612   0.02 220.08  15.42  14.91   12.68
## number_of_tweets            612   0.28 487.69  76.10  72.89   38.56
## sales                       612   7.80 103.60  25.76  26.22    8.02
## ROA                         612  -1.22  38.05  14.24  13.96    6.33
## fin_leverage                612  -5.96  18.82   1.03   0.45    3.15
## buyback_yield               612  -4.15  17.90   2.51   2.35    3.50
## log_sales                   612   2.17   4.65   3.24   3.30    0.31

КОРРЕЛЯЦИОННЫЙ АНАЛИЗ

library("Hmisc")
## Loading required package: lattice

## 
## Attaching package: 'lattice'

## The following object is masked from 'package:regclass':
## 
##     qq

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following object is masked from 'package:parsnip':
## 
##     translate

## The following object is masked from 'package:simputation':
## 
##     impute

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units
res2 <- rcorr(as.matrix(diplom_1802_w_imp[, c(6:20)]))
matrix_r = as.data.frame(res2$r)
matrix_p = as.data.frame(res2$P)

empty_as_na <- function(x){
  ifelse(is.na(x), as.integer('1'), x)
}
matrix_r = round(matrix_r, 3)
matrix_p = matrix_p %>% mutate_each(funs(empty_as_na)) 
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
## 
## # Simple named list: list(mean = mean, median = median)
## 
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
## 
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))

## Warning: `mutate_each_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `across()` instead.
## ℹ The deprecated feature was likely used in the dplyr package.
##   Please report the issue at <�]8;;https://github.com/tidyverse/dplyr/issues�https://github.com/tidyverse/dplyr/issues�]8;;�>.
for (i in seq.int(1,15,1)){
  for (j in seq.int(1,15,1)){
    if (matrix_p[i,j] <= 0.01){
      matrix_r[i,j] = str_c(as.character(matrix_r[i,j]), as.character('*'))
    }
    if (matrix_p[i,j] <= 0.05){
      matrix_r[i,j] = str_c(as.character(matrix_r[i,j]), as.character('*'))
    }
    if (matrix_p[i,j] <= 0.1){
      matrix_r[i,j] = str_c(as.character(matrix_r[i,j]), as.character('*'))
    }
  }
}
matrix_r
##                         price_to_book polarity_news subjectivity_news
## price_to_book                       1      0.446***          0.485***
## polarity_news                0.446***             1          0.784***
## subjectivity_news            0.485***      0.784***                 1
## pol_sub_news                 0.361***      0.728***          0.726***
## polarity_twitter             0.193***      0.353***          0.432***
## yoy_revenue_growth             0.076*     -0.315***          -0.38***
## google_trends                   -0.04        -0.054         -0.122***
## number_of_news               -0.16***         0.066            0.069*
## number_of_likes_twitter     -0.158***         -0.02             0.003
## number_of_tweets            -0.222***     -0.127***         -0.122***
## sales                          -0.028      0.132***          0.141***
## ROA                          0.685***       0.25***          0.382***
## fin_leverage                -0.109***        -0.033             0.009
## buyback_yield               -0.196***     -0.158***            -0.057
## log_sales                      -0.015      0.143***           0.19***
##                         pol_sub_news polarity_twitter yoy_revenue_growth
## price_to_book               0.361***         0.193***             0.076*
## polarity_news               0.728***         0.353***          -0.315***
## subjectivity_news           0.726***         0.432***           -0.38***
## pol_sub_news                       1         0.351***          -0.368***
## polarity_twitter            0.351***                1          -0.336***
## yoy_revenue_growth         -0.368***        -0.336***                  1
## google_trends              -0.107***         -0.19***           0.129***
## number_of_news                0.074*          0.091**          -0.119***
## number_of_likes_twitter         0.01            0.035             -0.037
## number_of_tweets           -0.116***            -0.01            0.094**
## sales                       0.116***            0.052          -0.195***
## ROA                         0.269***         0.261***          -0.279***
## fin_leverage                    0.03           -0.041              0.045
## buyback_yield                  -0.06           -0.024              0.006
## log_sales                    0.14***          0.096**           -0.23***
##                         google_trends number_of_news number_of_likes_twitter
## price_to_book                   -0.04       -0.16***               -0.158***
## polarity_news                  -0.054          0.066                   -0.02
## subjectivity_news           -0.122***         0.069*                   0.003
## pol_sub_news                -0.107***         0.074*                    0.01
## polarity_twitter             -0.19***        0.091**                   0.035
## yoy_revenue_growth           0.129***      -0.119***                  -0.037
## google_trends                       1       0.258***                 0.085**
## number_of_news               0.258***              1                0.478***
## number_of_likes_twitter       0.085**       0.478***                       1
## number_of_tweets             0.153***       0.549***                0.739***
## sales                        0.304***       0.336***                   0.041
## ROA                         -0.145***         -0.039                  -0.1**
## fin_leverage                 -0.084**         -0.029                   0.015
## buyback_yield                 -0.075*       0.141***                   0.028
## log_sales                    0.282***       0.266***                   0.038
##                         number_of_tweets     sales       ROA fin_leverage
## price_to_book                  -0.222***    -0.028  0.685***    -0.109***
## polarity_news                  -0.127***  0.132***   0.25***       -0.033
## subjectivity_news              -0.122***  0.141***  0.382***        0.009
## pol_sub_news                   -0.116***  0.116***  0.269***         0.03
## polarity_twitter                   -0.01     0.052  0.261***       -0.041
## yoy_revenue_growth               0.094** -0.195*** -0.279***        0.045
## google_trends                   0.153***  0.304*** -0.145***     -0.084**
## number_of_news                  0.549***  0.336***    -0.039       -0.029
## number_of_likes_twitter         0.739***     0.041    -0.1**        0.015
## number_of_tweets                       1     0.021 -0.154***       -0.013
## sales                              0.021         1     0.052    -0.128***
## ROA                            -0.154***     0.052         1       -0.07*
## fin_leverage                      -0.013 -0.128***    -0.07*            1
## buyback_yield                      0.047  0.369***    -0.032     0.388***
## log_sales                         -0.003  0.923***   0.087**    -0.127***
##                         buyback_yield log_sales
## price_to_book               -0.196***    -0.015
## polarity_news               -0.158***  0.143***
## subjectivity_news              -0.057   0.19***
## pol_sub_news                    -0.06   0.14***
## polarity_twitter               -0.024   0.096**
## yoy_revenue_growth              0.006  -0.23***
## google_trends                 -0.075*  0.282***
## number_of_news               0.141***  0.266***
## number_of_likes_twitter         0.028     0.038
## number_of_tweets                0.047    -0.003
## sales                        0.369***  0.923***
## ROA                            -0.032   0.087**
## fin_leverage                 0.388*** -0.127***
## buyback_yield                       1  0.349***
## log_sales                    0.349***         1

Цикл позволяет вывести корреляционную матрицу с указанием уровней значимости * = 10%, ** = 5%,*** = 1% соответственно. Борьба с мультиколлинеарностью проводится далее. На данном этапе лишь подтверждается ее необходимость.

МОДЕЛИРОВАНИЕ

dataPanel <- pdata.frame(diplom_1802_w_imp, index=c("ticker","year"))# трансформация в панельные данные
filter(dataPanel, year == "2017") %>%
  group_by(sector)%>%
  count(sector) # выводим представленный в выборке набор секторов
## # A tibble: 7 × 2
## # Groups:   sector [7]
##   sector                     n
##   <chr>                  <int>
## 1 Communication Services    13
## 2 Consumer Discretionary    15
## 3 Consumer Staples           7
## 4 Health Care               13
## 5 Industrials                8
## 6 Information Technology    42
## 7 Utilities                  4

МОДЕЛЬ МНОЖЕСТВЕННОЙ ЛИНЕЙНОЙ РЕГРЕССИИ

dataPanel_std <- pdata.frame(data.frame(diplom_1802_w_imp[,c(1:5)], scale(diplom_1802_w_imp[,-c(1:5)])), index=c("ticker","year")) #дополнительно создаем дф с стандартнизированными переменными
ols1 <-plm(price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  google_trends + number_of_news + number_of_tweets + sales +  ROA + fin_leverage + buyback_yield , data = filter(dataPanel_std, year != "2016"), model="within")
summary(ols1) # оцениваем первую спецификацию как обычную множественную линейную регрессию со стандартизированными коэффициентами 
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = price_to_book ~ polarity_news + polarity_twitter + 
##     yoy_revenue_growth + google_trends + number_of_news + number_of_tweets + 
##     sales + ROA + fin_leverage + buyback_yield, data = filter(dataPanel_std, 
##     year != "2016"), model = "within")
## 
## Balanced Panel: n = 102, T = 6, N = 612
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -1.903461 -0.172692 -0.021048  0.172409  1.617357 
## 
## Coefficients:
##                      Estimate Std. Error t-value  Pr(>|t|)    
## polarity_news       0.1975949  0.0280822  7.0363 6.535e-12 ***
## polarity_twitter    0.0070015  0.0238181  0.2940  0.768913    
## yoy_revenue_growth  0.1687012  0.1014829  1.6624  0.097067 .  
## google_trends       0.0666675  0.0237914  2.8022  0.005273 ** 
## number_of_news      0.0220853  0.0313411  0.7047  0.481339    
## number_of_tweets    0.0603640  0.0362501  1.6652  0.096497 .  
## sales              -0.0543929  0.0243196 -2.2366  0.025754 *  
## ROA                 0.6609974  0.0258889 25.5321 < 2.2e-16 ***
## fin_leverage       -0.0521373  0.0255184 -2.0431  0.041563 *  
## buyback_yield      -0.0145474  0.0289439 -0.5026  0.615461    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    208.97
## Residual Sum of Squares: 80.697
## R-Squared:      0.61384
## Adj. R-Squared: 0.52811
## F-statistic: 79.4792 on 10 and 500 DF, p-value: < 2.22e-16
ols2 <-plm(price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  google_trends + number_of_news + number_of_likes_twitter + sales +  ROA + fin_leverage + buyback_yield, data = filter(dataPanel_std, year != "2016"), model="within")
summary(ols2)# оцениваем вторую спецификацию как обычную множественную линейную регрессию со стандартизированными коэффициентами
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = price_to_book ~ polarity_news + polarity_twitter + 
##     yoy_revenue_growth + google_trends + number_of_news + number_of_likes_twitter + 
##     sales + ROA + fin_leverage + buyback_yield, data = filter(dataPanel_std, 
##     year != "2016"), model = "within")
## 
## Balanced Panel: n = 102, T = 6, N = 612
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -1.861411 -0.172475 -0.018288  0.159766  1.607813 
## 
## Coefficients:
##                           Estimate Std. Error t-value  Pr(>|t|)    
## polarity_news            0.2111822  0.0275253  7.6723 8.910e-14 ***
## polarity_twitter         0.0027737  0.0231268  0.1199 0.9045831    
## yoy_revenue_growth       0.1741428  0.0989866  1.7593 0.0791455 .  
## google_trends            0.0839150  0.0234737  3.5749 0.0003844 ***
## number_of_news           0.0276843  0.0283873  0.9752 0.3299146    
## number_of_likes_twitter  0.1217805  0.0230780  5.2769 1.961e-07 ***
## sales                   -0.0617157  0.0235184 -2.6241 0.0089523 ** 
## ROA                      0.6754596  0.0253669 26.6276 < 2.2e-16 ***
## fin_leverage            -0.0546116  0.0248894 -2.1942 0.0286822 *  
## buyback_yield           -0.0087167  0.0282713 -0.3083 0.7579650    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    208.97
## Residual Sum of Squares: 76.864
## R-Squared:      0.63218
## Adj. R-Squared: 0.55052
## F-statistic: 85.9362 on 10 and 500 DF, p-value: < 2.22e-16
ols3 <-plm(price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  google_trends + number_of_news + number_of_tweets + sales +  ROA + fin_leverage + buyback_yield, data = filter(dataPanel, year != "2016"), model="within")
summary(ols3) # оцениваем первую спецификацию как обычную множественную линейную регрессию
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = price_to_book ~ polarity_news + polarity_twitter + 
##     yoy_revenue_growth + google_trends + number_of_news + number_of_tweets + 
##     sales + ROA + fin_leverage + buyback_yield, data = filter(dataPanel, 
##     year != "2016"), model = "within")
## 
## Balanced Panel: n = 102, T = 6, N = 612
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -2.726513 -0.247364 -0.030149  0.246958  2.316698 
## 
## Coefficients:
##                       Estimate  Std. Error t-value  Pr(>|t|)    
## polarity_news      25.11107946  3.56879052  7.0363 6.535e-12 ***
## polarity_twitter    0.48001275  1.63293902  0.2940  0.768913    
## yoy_revenue_growth  0.01885132  0.01134010  1.6624  0.097067 .  
## google_trends       0.00106223  0.00037908  2.8022  0.005273 ** 
## number_of_news      0.00965490  0.01370117  0.7047  0.481339    
## number_of_tweets    0.00224214  0.00134646  1.6652  0.096497 .  
## sales              -0.00971785  0.00434496 -2.2366  0.025754 *  
## ROA                 0.14950853  0.00585571 25.5321 < 2.2e-16 ***
## fin_leverage       -0.02371977  0.01160952 -2.0431  0.041563 *  
## buyback_yield      -0.00594522  0.01182875 -0.5026  0.615461    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    428.76
## Residual Sum of Squares: 165.57
## R-Squared:      0.61384
## Adj. R-Squared: 0.52811
## F-statistic: 79.4792 on 10 and 500 DF, p-value: < 2.22e-16
ols4 <-plm(price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  google_trends + number_of_news + number_of_likes_twitter + sales +  ROA + fin_leverage + buyback_yield, data = filter(dataPanel, year != "2016"), model="within")
summary(ols4)# оцениваем вторую спецификацию как обычную множественную линейную регрессию 
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = price_to_book ~ polarity_news + polarity_twitter + 
##     yoy_revenue_growth + google_trends + number_of_news + number_of_likes_twitter + 
##     sales + ROA + fin_leverage + buyback_yield, data = filter(dataPanel, 
##     year != "2016"), model = "within")
## 
## Balanced Panel: n = 102, T = 6, N = 612
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -2.666281 -0.247053 -0.026195  0.228849  2.303028 
## 
## Coefficients:
##                            Estimate  Std. Error t-value  Pr(>|t|)    
## polarity_news           26.83779857  3.49801536  7.6723 8.910e-14 ***
## polarity_twitter         0.19016168  1.58554481  0.1199 0.9045831    
## yoy_revenue_growth       0.01945939  0.01106114  1.7593 0.0791455 .  
## google_trends            0.00133704  0.00037401  3.5749 0.0003844 ***
## number_of_news           0.01210258  0.01240990  0.9752 0.3299146    
## number_of_likes_twitter  0.01376069  0.00260772  5.2769 1.961e-07 ***
## sales                   -0.01102614  0.00420181 -2.6241 0.0089523 ** 
## ROA                      0.15277968  0.00573763 26.6276 < 2.2e-16 ***
## fin_leverage            -0.02484544  0.01132336 -2.1942 0.0286822 *  
## buyback_yield           -0.00356232  0.01155387 -0.3083 0.7579650    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    428.76
## Residual Sum of Squares: 157.71
## R-Squared:      0.63218
## Adj. R-Squared: 0.55052
## F-statistic: 85.9362 on 10 and 500 DF, p-value: < 2.22e-16

МОДЕЛЬ МНОЖЕСТВЕННОЙ ЛИНЕЙНОЙ РЕГРЕССИИ | ВЫБОР СПЕЦИФИКАЦИИ

ols <-lm(price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  google_trends + number_of_news + number_of_tweets + sales +  ROA + fin_leverage + buyback_yield, data = filter(dataPanel, year != "2016"))
summary(ols)
## 
## Call:
## lm(formula = price_to_book ~ polarity_news + polarity_twitter + 
##     yoy_revenue_growth + google_trends + number_of_news + number_of_tweets + 
##     sales + ROA + fin_leverage + buyback_yield, data = filter(dataPanel, 
##     year != "2016"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5872 -0.3956 -0.0876  0.3342  4.3366 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -4.2611883  0.3583478 -11.891  < 2e-16 ***
## polarity_news      47.2520171  3.2026623  14.754  < 2e-16 ***
## polarity_twitter    1.8313043  1.7020330   1.076  0.28238    
## yoy_revenue_growth  0.0428334  0.0027884  15.361  < 2e-16 ***
## google_trends       0.0009137  0.0004016   2.275  0.02324 *  
## number_of_news     -0.0361348  0.0127386  -2.837  0.00471 ** 
## number_of_tweets   -0.0024498  0.0010205  -2.401  0.01667 *  
## sales               0.0001444  0.0051885   0.028  0.97780    
## ROA                 0.1542498  0.0053542  28.809  < 2e-16 ***
## fin_leverage       -0.0133935  0.0115023  -1.164  0.24471    
## buyback_yield      -0.0357095  0.0116653  -3.061  0.00230 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7735 on 601 degrees of freedom
## Multiple R-squared:  0.7132, Adjusted R-squared:  0.7084 
## F-statistic: 149.5 on 10 and 601 DF,  p-value: < 2.2e-16
regclass::VIF(ols)
##      polarity_news   polarity_twitter yoy_revenue_growth      google_trends 
##           1.330886           1.291543           1.304850           1.331091 
##     number_of_news   number_of_tweets              sales                ROA 
##           1.779327           1.581860           1.767337           1.174227 
##       fin_leverage      buyback_yield 
##           1.339493           1.707357

Мультиколлинеарности в модели после удаления части переменных нет, так как коэффициенты VIF < 5. Дополнительное подтверждение отсутствию необходимости включения регуляризации. Однако в выборке присутствуют компании и года, значительно отличающиеся друг от друга, проверим дополнительные спецификации.

bptest(ols4, data = dataPanel)
## 
##  studentized Breusch-Pagan test
## 
## data:  ols4
## BP = 117.69, df = 10, p-value < 2.2e-16
plm::pbgtest(ols4, data = dataPanel)
## 
##  Breusch-Godfrey/Wooldridge test for serial correlation in panel models
## 
## data:  price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  ...
## chisq = 71.11, df = 6, p-value = 2.42e-13
## alternative hypothesis: serial correlation in idiosyncratic errors

Есть автокорреляция и гетероскедастичность, необходимо использовать скорректированную ковариационную матрицу.

coeftest(ols4, vcovHC(ols4, method = "arellano", type = "HC2"))
## 
## t test of coefficients:
## 
##                            Estimate  Std. Error t value  Pr(>|t|)    
## polarity_news           26.83779857  7.37249982  3.6403 0.0003007 ***
## polarity_twitter         0.19016168  1.91886821  0.0991 0.9210978    
## yoy_revenue_growth       0.01945939  0.01052297  1.8492 0.0650148 .  
## google_trends            0.00133704  0.00064068  2.0869 0.0374018 *  
## number_of_news           0.01210258  0.02130224  0.5681 0.5701972    
## number_of_likes_twitter  0.01376069  0.00479490  2.8699 0.0042804 ** 
## sales                   -0.01102614  0.00530562 -2.0782 0.0382006 *  
## ROA                      0.15277968  0.01074163 14.2231 < 2.2e-16 ***
## fin_leverage            -0.02484544  0.00850044 -2.9228 0.0036257 ** 
## buyback_yield           -0.00356232  0.00979264 -0.3638 0.7161795    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plm::pcdtest(ols4, test = c("cd"))
## 
##  Pesaran CD test for cross-sectional dependence in panels
## 
## data:  price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +     google_trends + number_of_news + number_of_likes_twitter +     sales + ROA + fin_leverage + buyback_yield
## z = -0.5697, p-value = 0.5689
## alternative hypothesis: cross-sectional dependence

no cross-sectional dependence

plm::pcdtest(ols4, test = c("lm"))
## 
##  Breusch-Pagan LM test for cross-sectional dependence in panels
## 
## data:  price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +     google_trends + number_of_news + number_of_likes_twitter +     sales + ROA + fin_leverage + buyback_yield
## chisq = 6560.3, df = 5151, p-value < 2.2e-16
## alternative hypothesis: cross-sectional dependence

cross-sectional dependence Вывод: результаты тестов разнятся, однако даже в случае наличия кросс-секциональной зависимости это небольшая проблема, так как у нас короткий временной интервал и большая выборка компаний.

dwtest(ols)
## 
##  Durbin-Watson test
## 
## data:  ols
## DW = 1.0993, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
fixed <- plm(price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  google_trends + number_of_news + number_of_tweets + sales +  ROA + fin_leverage + buyback_yield, data = filter(dataPanel, year != "2016"), model="within")
summary(fixed)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = price_to_book ~ polarity_news + polarity_twitter + 
##     yoy_revenue_growth + google_trends + number_of_news + number_of_tweets + 
##     sales + ROA + fin_leverage + buyback_yield, data = filter(dataPanel, 
##     year != "2016"), model = "within")
## 
## Balanced Panel: n = 102, T = 6, N = 612
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -2.726513 -0.247364 -0.030149  0.246958  2.316698 
## 
## Coefficients:
##                       Estimate  Std. Error t-value  Pr(>|t|)    
## polarity_news      25.11107946  3.56879052  7.0363 6.535e-12 ***
## polarity_twitter    0.48001275  1.63293902  0.2940  0.768913    
## yoy_revenue_growth  0.01885132  0.01134010  1.6624  0.097067 .  
## google_trends       0.00106223  0.00037908  2.8022  0.005273 ** 
## number_of_news      0.00965490  0.01370117  0.7047  0.481339    
## number_of_tweets    0.00224214  0.00134646  1.6652  0.096497 .  
## sales              -0.00971785  0.00434496 -2.2366  0.025754 *  
## ROA                 0.14950853  0.00585571 25.5321 < 2.2e-16 ***
## fin_leverage       -0.02371977  0.01160952 -2.0431  0.041563 *  
## buyback_yield      -0.00594522  0.01182875 -0.5026  0.615461    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    428.76
## Residual Sum of Squares: 165.57
## R-Squared:      0.61384
## Adj. R-Squared: 0.52811
## F-statistic: 79.4792 on 10 and 500 DF, p-value: < 2.22e-16

Фиксированные эффекты = константы для каждой компании:

fixef(fixed)
##      AAPL      ABNB      ADBE       ADI       ADP      ADSK       AEP      ALGN 
## -5.268724 -1.091923 -0.060543 -2.979120 -1.645704 -0.895133 -1.607038 -1.266904 
##      AMAT       AMD      AMGN      AMZN      ANSS      ASML      ATVI      AVGO 
## -3.188847 -1.965730 -2.417254 -0.540221 -1.584121 -1.364192 -1.833405 -1.468335 
##       AZN      BIDU      BIIB      BKNG      CDNS       CEG      CHTR     CMCSA 
## -1.578474 -1.750082 -2.571405 -2.171583 -1.806305 -1.410195 -1.947064 -2.910469 
##      COST      CPRT      CRWD      CSCO       CSX      CTAS      CTSH      DDOG 
## -2.488231 -1.200323 -1.636930 -2.711529 -2.488288 -1.798977 -2.166980 -1.546219 
##      DLTR      DOCU      DXCM        EA      EBAY       EXC      FAST      FISV 
## -2.104780 -1.724635 -1.692139 -1.924382 -2.194261 -2.431851 -2.245624 -2.126222 
##      FTNT      GILD      GOOG     GOOGL       HON      IDXX      ILMN      INTC 
## -1.744955 -2.105113 -1.974287 -1.895555 -2.403201 -1.704043 -1.415760 -3.245168 
##      INTU      ISRG        JD       KDP       KHC      KLAC      LCID      LRCX 
##  0.301209 -1.157126 -3.115394 -2.097613 -2.180186 -2.170309 -1.021640 -2.232596 
##      LULU       MAR      MCHP      MDLZ      MELI      META      MNST      MRNA 
## -2.190834 -2.034622 -1.618909 -2.464274 -1.823332 -1.236988 -1.848202 -1.457964 
##      MRVL      MSFT      MTCH        MU      NFLX      NTES      NVDA      NXPI 
## -1.637366 -2.262962 -1.970139 -1.610258 -0.130539 -1.848994  0.938387 -2.187915 
##      ODFL      OKTA      ORLY      PANW      PAYX      PCAR       PDD       PEP 
## -1.667947 -1.788360 -2.294968 -1.904230 -1.789004 -2.418683 -1.699515 -2.224405 
##      PYPL      QCOM      REGN      ROST      SBUX      SGEN      SIRI      SNPS 
## -1.710553 -2.543108 -2.125301 -1.949423 -2.435735 -1.672976 -2.049722 -1.405538 
##      SPLK      SWKS      TEAM      TMUS      TSLA       TXN      VRSK      VRSN 
## -2.028079 -2.163760 -1.843044 -2.184615 -0.903707 -2.073659 -2.307856 -2.221563 
##      VRTX       WBA      WDAY       XEL        ZM        ZS 
## -2.158579 -2.331539 -1.634870 -1.886763 -1.307891 -1.736754

Сравним спецификации с учетом фиксированных эффектов и без:

pFtest(fixed, ols)
## 
##  F test for individual effects
## 
## data:  price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  ...
## F = 5.7995, df1 = 101, df2 = 500, p-value < 2.2e-16
## alternative hypothesis: significant effects

Нулевая гипотеза: спецификация МНК лучше спецификации с фиксированными эффектами p-value < 2.2e-16 < 0.01, следовательно, на уровне значимости 1% нулевая гипотеза отклоняется, фиксированные эффекты значимы.

Оценим спецификацию со случайными эффектами:

random <- plm(price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  google_trends + number_of_news + number_of_tweets + sales +  ROA + fin_leverage + buyback_yield, data = filter(dataPanel, year != "2016"), model="random")
summary(random)
## Oneway (individual) effect Random Effect Model 
##    (Swamy-Arora's transformation)
## 
## Call:
## plm(formula = price_to_book ~ polarity_news + polarity_twitter + 
##     yoy_revenue_growth + google_trends + number_of_news + number_of_tweets + 
##     sales + ROA + fin_leverage + buyback_yield, data = filter(dataPanel, 
##     year != "2016"), model = "random")
## 
## Balanced Panel: n = 102, T = 6, N = 612
## 
## Effects:
##                  var std.dev share
## idiosyncratic 0.3311  0.5755 0.632
## individual    0.1926  0.4388 0.368
## theta: 0.528
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -2.610474 -0.308718 -0.057921  0.260679  3.231782 
## 
## Coefficients:
##                       Estimate  Std. Error z-value  Pr(>|z|)    
## (Intercept)        -2.82509910  0.37697534 -7.4941 6.674e-14 ***
## polarity_news      34.48548368  3.31286816 10.4096 < 2.2e-16 ***
## polarity_twitter    0.35801533  1.59610592  0.2243  0.822520    
## yoy_revenue_growth  0.03597278  0.00405744  8.8659 < 2.2e-16 ***
## google_trends       0.00106144  0.00037318  2.8443  0.004451 ** 
## number_of_news     -0.00967462  0.01294532 -0.7473  0.454855    
## number_of_tweets   -0.00072379  0.00114784 -0.6306  0.528323    
## sales              -0.00562326  0.00442311 -1.2713  0.203609    
## ROA                 0.15024663  0.00546816 27.4766 < 2.2e-16 ***
## fin_leverage       -0.02323962  0.01087148 -2.1377  0.032544 *  
## buyback_yield      -0.02162316  0.01118163 -1.9338  0.053136 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    612.51
## Residual Sum of Squares: 221.69
## R-Squared:      0.63807
## Adj. R-Squared: 0.63205
## Chisq: 1059.54 on 10 DF, p-value: < 2.22e-16

Чтобы выбрать между фиксированными или случайными эффектами, проведем тест Хаусмана, где нулевая гипотеза состоит в том, что предпочтительная модель — это случайные эффекты, а альтернативная — фиксированные эффекты (см. Green, 2008, глава 9). По сути, он проверяет, коррелируют ли уникальные ошибки с регрессорами, но нулевая гипотеза заключается в том, что это не так. Если значение p значимо (например, <0,05), необходимо использовать фиксированные эффекты, если нет, - случайные эффекты.

phtest(fixed, random)
## 
##  Hausman Test
## 
## data:  price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  ...
## chisq = 120.89, df = 10, p-value < 2.2e-16
## alternative hypothesis: one model is inconsistent

В нашем случае p-value < 2.2e-16, на 1% уровне значимости нулевая гипотеза отвергается, следовательно, релевантнее модель с фиксированными эффектами. Также необходимо проверить наличие значимых временных эффектов.

fixed.time <- plm(price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  google_trends + number_of_news + number_of_tweets + sales +  ROA + fin_leverage + buyback_yield + factor(year), data = filter(dataPanel, year != "2016"), model="within")
summary(fixed.time)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = price_to_book ~ polarity_news + polarity_twitter + 
##     yoy_revenue_growth + google_trends + number_of_news + number_of_tweets + 
##     sales + ROA + fin_leverage + buyback_yield + factor(year), 
##     data = filter(dataPanel, year != "2016"), model = "within")
## 
## Balanced Panel: n = 102, T = 6, N = 612
## 
## Residuals:
##     Min.  1st Qu.   Median  3rd Qu.     Max. 
## -2.75165 -0.25545 -0.02850  0.24421  2.33601 
## 
## Coefficients:
##                       Estimate  Std. Error t-value  Pr(>|t|)    
## polarity_news      24.92529548  3.67057085  6.7906  3.21e-11 ***
## polarity_twitter    0.42532670  1.64773139  0.2581  0.796415    
## yoy_revenue_growth  0.01886452  0.01147667  1.6437  0.100867    
## google_trends       0.00101128  0.00038639  2.6173  0.009135 ** 
## number_of_news      0.00980988  0.01386546  0.7075  0.479586    
## number_of_tweets    0.00218942  0.00136210  1.6074  0.108609    
## sales              -0.01048957  0.00458596 -2.2873  0.022598 *  
## ROA                 0.14932862  0.00600220 24.8790 < 2.2e-16 ***
## fin_leverage       -0.02403491  0.01168224 -2.0574  0.040172 *  
## buyback_yield      -0.00612786  0.01196459 -0.5122  0.608763    
## factor(year)2018    0.04002950  0.08368190  0.4784  0.632610    
## factor(year)2019   -0.02957576  0.09070176 -0.3261  0.744504    
## factor(year)2020    0.00763063  0.08454971  0.0903  0.928125    
## factor(year)2021    0.02453276  0.08370993  0.2931  0.769592    
## factor(year)2022   -0.01166094  0.08362002 -0.1395  0.889150    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    428.76
## Residual Sum of Squares: 165.29
## R-Squared:      0.61449
## Adj. R-Squared: 0.52415
## F-statistic: 52.6018 on 15 and 495 DF, p-value: < 2.22e-16
pFtest(fixed.time, fixed)
## 
##  F test for individual effects
## 
## data:  price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  ...
## F = 0.16857, df1 = 5, df2 = 495, p-value = 0.9741
## alternative hypothesis: significant effects

Нулевая гипотеза: проверяемый эффект не значим или равен 0. p-value = 0.9741, следовательно, на уровне значимости 1% нулевая гипотеза на исследуемой выборке не отвергается, временные эффекты не значимы.

plmtest(fixed, c("time"), type=("bp"))
## 
##  Lagrange Multiplier Test - time effects (Breusch-Pagan)
## 
## data:  price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  ...
## chisq = 1.076, df = 1, p-value = 0.2996
## alternative hypothesis: significant effects

Результаты теста Бреуша-Пагана аналогичны, наилучшей является спецификация с фиксированными эффектами.

datapanel_train = filter(dataPanel[,-c(8, 9, 20)], year != "2016" & year !="2017")

МОДЕЛЬ БУСТИНГ СЛУЧАЙНЫХ ЛЕСОВ

#install.packages('xgboost')
library(xgboost)
## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:dplyr':
## 
##     slice
# Create the specification with placeholders
boost_spec <- boost_tree(
  trees = 1000,
  learn_rate = tune(),
  tree_depth = tune()) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

# Create the tuning grid
tunegrid_boost <- grid_regular(parameters(boost_spec), 
                               levels = 5)
## Warning: `parameters.model_spec()` was deprecated in tune 0.1.6.9003.
## ℹ Please use `hardhat::extract_parameter_set_dials()` instead.
tunegrid_boost
## # A tibble: 25 × 2
##    tree_depth learn_rate
##         <int>      <dbl>
##  1          1    0.001  
##  2          4    0.001  
##  3          8    0.001  
##  4         11    0.001  
##  5         15    0.001  
##  6          1    0.00422
##  7          4    0.00422
##  8          8    0.00422
##  9         11    0.00422
## 10         15    0.00422
## # … with 15 more rows
# Create CV folds of training data
folds <- vfold_cv(datapanel_train[,-c(1:5)], v = 6)

# Tune along the grid
tune_results <- tune_grid(boost_spec,
                          price_to_book ~ .,
                          resamples = folds,
                          grid = tunegrid_boost,
                          metrics = metric_set(rmse, mae, rsq))

# Plot the results
autoplot(tune_results)

# Select the final hyperparameters
best_params <- select_best(tune_results)
## Warning: No value of `metric` was given; metric 'rmse' will be used.
# Finalize the specification
final_spec <- finalize_model(boost_spec, best_params)

# Train the final model on the full training data
final_model <- final_spec %>% fit(formula = price_to_book ~ ., 
                                  data = datapanel_train[,-c(1:5)])

final_model
## parsnip model object
## 
## ##### xgb.Booster
## raw: 1.5 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = 0.0177827941003892, max_depth = 4L, 
##     gamma = 0, colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, 
##     subsample = 1), data = x$data, nrounds = 1000, watchlist = x$watchlist, 
##     verbose = 0, nthread = 1, objective = "reg:squarederror")
## params (as set within xgb.train):
##   eta = "0.0177827941003892", max_depth = "4", gamma = "0", colsample_bytree = "1", colsample_bynode = "1", min_child_weight = "1", subsample = "1", nthread = "1", objective = "reg:squarederror", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## callbacks:
##   cb.evaluation.log()
## # of features: 11 
## niter: 1000
## nfeatures : 11 
## evaluation_log:
##     iter training_rmse
##        1    2.90173070
##        2    2.85470977
## ---                   
##      999    0.09365539
##     1000    0.09359102
vip::vip(final_model)

best_params
## # A tibble: 1 × 3
##   tree_depth learn_rate .config              
##        <int>      <dbl> <chr>                
## 1          4     0.0178 Preprocessor1_Model12

КАЧЕСТВО МОДЕЛИ МНОЖЕСТВЕННОЙ ЛИНЕЙНОЙ РЕГРЕССИИ С ВКЛЮЧЕНИЕМ ФИКСИРОВАННЫХ ЭФФЕКТОВ

datapanel = filter(dataPanel[,-c(8, 9, 20)], year != "2016")
# Predict new data
predictions_fixed_1 <- predict(ols3,
                       new_data = datapanel[,-c(1:5)])
# Compute the mean absolute error using one single function
mae_fixed_1 = mae(cbind(datapanel[,-c(1:5)], predictions_fixed_1),
    truth = price_to_book,
    estimate = predictions_fixed_1)
# Compute the RMSE using a function
rmse_fixed_1 = rmse(cbind(datapanel[,-c(1:5)], predictions_fixed_1),
    truth = price_to_book,
    estimate = predictions_fixed_1)
rsq_fixed_1 = rsq_trad(cbind(datapanel[,-c(1:5)], predictions_fixed_1),
    truth = price_to_book,
    estimate = predictions_fixed_1)

# Print errors
mae_fixed_1["model"] = "fixed_1_1"
#mae_fixedtime
rmse_fixed_1["model"] = "fixed_1_1"
#rmse_fixedtime
rsq_fixed_1["model"] = "fixed_1_1"
#rsq_fixedtime
fixed_evaluation_1 = rbind(mae_fixed_1, rmse_fixed_1, rsq_fixed_1)
fixed_evaluation_1
## # A tibble: 3 × 4
##   .metric  .estimator .estimate model    
##   <chr>    <chr>          <dbl> <chr>    
## 1 mae      standard       0.359 fixed_1_1
## 2 rmse     standard       0.520 fixed_1_1
## 3 rsq_trad standard       0.868 fixed_1_1
# Predict new data
predictions_fixed_2 <- predict(ols4,
                       new_data = datapanel[,-c(1:5)])
# Compute the mean absolute error using one single function
mae_fixed_2 = mae(cbind(datapanel[,-c(1:5)], predictions_fixed_2),
    truth = price_to_book,
    estimate = predictions_fixed_2)
# Compute the RMSE using a function
rmse_fixed_2 = rmse(cbind(datapanel[,-c(1:5)], predictions_fixed_2),
    truth = price_to_book,
    estimate = predictions_fixed_2)
rsq_fixed_2 = rsq_trad(cbind(datapanel[,-c(1:5)], predictions_fixed_2),
    truth = price_to_book,
    estimate = predictions_fixed_2)

# Print errors
mae_fixed_2["model"] = "fixed_1_2"
#mae_fixedtime
rmse_fixed_2["model"] = "fixed_1_2"
#rmse_fixedtime
rsq_fixed_2["model"] = "fixed_1_2"
#rsq_fixedtime
fixed_evaluation_2 = rbind(mae_fixed_2, rmse_fixed_2, rsq_fixed_2)
fixed_evaluation_2
## # A tibble: 3 × 4
##   .metric  .estimator .estimate model    
##   <chr>    <chr>          <dbl> <chr>    
## 1 mae      standard       0.352 fixed_1_2
## 2 rmse     standard       0.508 fixed_1_2
## 3 rsq_trad standard       0.874 fixed_1_2

КАЧЕСТВО МОДЕЛИ БУСТИНГА СЛУЧАЙНОГО ЛЕСА

diplom_1802_w_imp = read.csv('/Users/olesyamba/Downloads/data/diplom_1802_w_imp.csv')
dataPanel <- pdata.frame(diplom_1802_w_imp, index=c("ticker","year"))
datapanel_test = filter(dataPanel[,-c(8, 9, 20)], year == "2016" | year =="2017")
# Predict new data
predictions_boostforest <- predict(final_model,
                       new_data = datapanel_test[,-c(1:5)])
# Compute the mean absolute error using one single function
mae_boostforest = mae(predictions_boostforest,
    truth = datapanel_test[,-c(1:5)]$price_to_book,
    estimate = .pred)
# Compute the RMSE using a function
rmse_boostforest = rmse(predictions_boostforest,
    truth = datapanel_test[,-c(1:5)]$price_to_book,
    estimate = .pred)
rsq_boostforest = rsq_trad(predictions_boostforest,
    truth = datapanel_test[,-c(1:5)]$price_to_book,
    estimate = .pred)

# Print errors
mae_boostforest["model"] = "boosted_random_forest"
#mae_fixedtime
rmse_boostforest["model"] = "boosted_random_forest"
#rmse_fixedtime
rsq_boostforest["model"] = "boosted_random_forest"
#rsq_fixedtime
boostforest_evaluation = rbind(mae_boostforest, rmse_boostforest, rsq_boostforest)
boostforest_evaluation
## # A tibble: 3 × 4
##   .metric  .estimator .estimate model                
##   <chr>    <chr>          <dbl> <chr>                
## 1 mae      standard       0.360 boosted_random_forest
## 2 rmse     standard       0.596 boosted_random_forest
## 3 rsq_trad standard       0.692 boosted_random_forest
evaluation_metrics = rbind(fixed_evaluation_1, fixed_evaluation_2, boostforest_evaluation)
colnames(evaluation_metrics) = c("Metric", "Estimator", "Estimation", "Model")
evaluation_metrics
## # A tibble: 9 × 4
##   Metric   Estimator Estimation Model                
##   <chr>    <chr>          <dbl> <chr>                
## 1 mae      standard       0.359 fixed_1_1            
## 2 rmse     standard       0.520 fixed_1_1            
## 3 rsq_trad standard       0.868 fixed_1_1            
## 4 mae      standard       0.352 fixed_1_2            
## 5 rmse     standard       0.508 fixed_1_2            
## 6 rsq_trad standard       0.874 fixed_1_2            
## 7 mae      standard       0.360 boosted_random_forest
## 8 rmse     standard       0.596 boosted_random_forest
## 9 rsq_trad standard       0.692 boosted_random_forest

АНАЛИЗ УСТОЙЧИВОСТИ

diplom_1802_w_imp = read.csv('/Users/olesyamba/Downloads/data/diplom_1802_w_imp.csv')
diplom_1802_w_imp1 = filter(diplom_1802_w_imp, year == "2017"|year == "2018")
diplom_1802_w_imp2 = filter(diplom_1802_w_imp, year == "2019"|year == "2020")
diplom_1802_w_imp3 = filter(diplom_1802_w_imp, year == "2021"|year == "2022")
dataPanel1 <- pdata.frame(diplom_1802_w_imp1, index=c("ticker","year"))
dataPanel2 <- pdata.frame(diplom_1802_w_imp2, index=c("ticker","year"))
dataPanel3 <- pdata.frame(diplom_1802_w_imp3, index=c("ticker","year"))

МОДЕЛЬ МНОЖЕСТВЕННОЙ ЛИНЕЙНОЙ РЕГРЕССИИ

fixed1_1 <- plm(price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  google_trends + number_of_news + number_of_tweets + sales +  ROA + fin_leverage + buyback_yield, data = dataPanel1, model="within")
summary(fixed1_1)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = price_to_book ~ polarity_news + polarity_twitter + 
##     yoy_revenue_growth + google_trends + number_of_news + number_of_tweets + 
##     sales + ROA + fin_leverage + buyback_yield, data = dataPanel1, 
##     model = "within")
## 
## Balanced Panel: n = 102, T = 2, N = 204
## 
## Residuals:
##        Min.     1st Qu.      Median     3rd Qu.        Max. 
## -1.0268e+00 -1.6253e-01  4.5146e-16  1.6253e-01  1.0268e+00 
## 
## Coefficients:
##                       Estimate  Std. Error t-value  Pr(>|t|)    
## polarity_news      14.37122909  6.14715589  2.3379  0.021562 *  
## polarity_twitter   -0.36059325  3.01081848 -0.1198  0.904930    
## yoy_revenue_growth  0.07705016  0.11240291  0.6855  0.494764    
## google_trends      -0.00027423  0.00140911 -0.1946  0.846127    
## number_of_news      0.18301852  0.05625504  3.2534  0.001596 ** 
## number_of_tweets   -0.01387097  0.00632290 -2.1938  0.030772 *  
## sales              -0.01402590  0.01022547 -1.3717  0.173504    
## ROA                 0.13511819  0.01052098 12.8427 < 2.2e-16 ***
## fin_leverage       -0.01282421  0.02920037 -0.4392  0.661561    
## buyback_yield      -0.00057185  0.02119325 -0.0270  0.978532    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    72.093
## Residual Sum of Squares: 19.526
## R-Squared:      0.72916
## Adj. R-Squared: 0.40238
## F-statistic: 24.7681 on 10 and 92 DF, p-value: < 2.22e-16
fixed1_2 <- plm(price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  google_trends + number_of_news + number_of_likes_twitter + sales +  ROA + fin_leverage + buyback_yield, data = dataPanel1, model="within")
summary(fixed1_2)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = price_to_book ~ polarity_news + polarity_twitter + 
##     yoy_revenue_growth + google_trends + number_of_news + number_of_likes_twitter + 
##     sales + ROA + fin_leverage + buyback_yield, data = dataPanel1, 
##     model = "within")
## 
## Balanced Panel: n = 102, T = 2, N = 204
## 
## Residuals:
##        Min.     1st Qu.      Median     3rd Qu.        Max. 
## -9.6340e-01 -1.6344e-01 -1.6523e-16  1.6344e-01  9.6340e-01 
## 
## Coefficients:
##                           Estimate Std. Error t-value Pr(>|t|)    
## polarity_news           14.8841081  6.3465892  2.3452  0.02117 *  
## polarity_twitter         0.8663237  3.0445566  0.2845  0.77663    
## yoy_revenue_growth       0.0012769  0.1144324  0.0112  0.99112    
## google_trends           -0.0010243  0.0014198 -0.7214  0.47247    
## number_of_news           0.0915475  0.0514224  1.7803  0.07833 .  
## number_of_likes_twitter  0.0247709  0.0248682  0.9961  0.32182    
## sales                    0.0023871  0.0097863  0.2439  0.80783    
## ROA                      0.1420189  0.0115497 12.2963  < 2e-16 ***
## fin_leverage            -0.0062061  0.0298837 -0.2077  0.83594    
## buyback_yield            0.0060592  0.0215155  0.2816  0.77887    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    72.093
## Residual Sum of Squares: 20.328
## R-Squared:      0.71803
## Adj. R-Squared: 0.37783
## F-statistic: 23.4277 on 10 and 92 DF, p-value: < 2.22e-16
fixed2_1 <- plm(price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  google_trends + number_of_news + number_of_tweets + sales +  ROA + fin_leverage + buyback_yield, data = dataPanel2, model="within")
summary(fixed2_1)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = price_to_book ~ polarity_news + polarity_twitter + 
##     yoy_revenue_growth + google_trends + number_of_news + number_of_tweets + 
##     sales + ROA + fin_leverage + buyback_yield, data = dataPanel2, 
##     model = "within")
## 
## Balanced Panel: n = 102, T = 2, N = 204
## 
## Residuals:
##        Min.     1st Qu.      Median     3rd Qu.        Max. 
## -1.3231e+00 -1.3635e-01 -2.8428e-16  1.3635e-01  1.3231e+00 
## 
## Coefficients:
##                       Estimate  Std. Error t-value  Pr(>|t|)    
## polarity_news       1.6854e+01  7.0282e+00  2.3980   0.01850 *  
## polarity_twitter   -5.2471e+00  3.9080e+00 -1.3427   0.18268    
## yoy_revenue_growth -8.5357e-03  2.3717e-02 -0.3599   0.71975    
## google_trends      -6.4638e-05  7.4014e-04 -0.0873   0.93060    
## number_of_news      5.1190e-02  4.5631e-02  1.1218   0.26485    
## number_of_tweets   -3.5223e-03  3.3878e-03 -1.0397   0.30119    
## sales              -1.9026e-02  8.5391e-03 -2.2282   0.02831 *  
## ROA                 1.6573e-01  1.6697e-02  9.9260 3.267e-16 ***
## fin_leverage       -7.3582e-02  2.3992e-02 -3.0669   0.00284 ** 
## buyback_yield       3.0785e-02  2.3702e-02  1.2988   0.19725    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    38.473
## Residual Sum of Squares: 16.686
## R-Squared:      0.5663
## Adj. R-Squared: 0.043031
## F-statistic: 12.0128 on 10 and 92 DF, p-value: 5.1753e-13
fixed2_2 <- plm(price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  google_trends + number_of_news + number_of_likes_twitter + sales +  ROA + fin_leverage + buyback_yield, data = dataPanel2, model="within")
summary(fixed2_2)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = price_to_book ~ polarity_news + polarity_twitter + 
##     yoy_revenue_growth + google_trends + number_of_news + number_of_likes_twitter + 
##     sales + ROA + fin_leverage + buyback_yield, data = dataPanel2, 
##     model = "within")
## 
## Balanced Panel: n = 102, T = 2, N = 204
## 
## Residuals:
##        Min.     1st Qu.      Median     3rd Qu.        Max. 
## -1.3208e+00 -1.2727e-01 -5.6606e-16  1.2727e-01  1.3208e+00 
## 
## Coefficients:
##                            Estimate  Std. Error t-value  Pr(>|t|)    
## polarity_news           17.26980244  7.06806630  2.4434  0.016460 *  
## polarity_twitter        -5.07417818  4.00467520 -1.2671  0.208331    
## yoy_revenue_growth      -0.01266937  0.02361684 -0.5365  0.592939    
## google_trends           -0.00021547  0.00073264 -0.2941  0.769343    
## number_of_news           0.03085816  0.04530642  0.6811  0.497520    
## number_of_likes_twitter  0.00138078  0.00688275  0.2006  0.841442    
## sales                   -0.01672805  0.00835767 -2.0015  0.048282 *  
## ROA                      0.16997193  0.01676548 10.1382 < 2.2e-16 ***
## fin_leverage            -0.07194695  0.02411252 -2.9838  0.003645 ** 
## buyback_yield            0.02792485  0.02372115  1.1772  0.242147    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    38.473
## Residual Sum of Squares: 16.874
## R-Squared:      0.5614
## Adj. R-Squared: 0.03221
## F-statistic: 11.7756 on 10 and 92 DF, p-value: 8.3955e-13
fixed3_1 <- plm(price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  google_trends + number_of_news + number_of_tweets + sales +  ROA + fin_leverage + buyback_yield, data = dataPanel3, model="within")
summary(fixed3_1)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = price_to_book ~ polarity_news + polarity_twitter + 
##     yoy_revenue_growth + google_trends + number_of_news + number_of_tweets + 
##     sales + ROA + fin_leverage + buyback_yield, data = dataPanel3, 
##     model = "within")
## 
## Balanced Panel: n = 102, T = 2, N = 204
## 
## Residuals:
##        Min.     1st Qu.      Median     3rd Qu.        Max. 
## -6.3761e-01 -1.3421e-01  1.6827e-16  1.3421e-01  6.3761e-01 
## 
## Coefficients:
##                       Estimate  Std. Error t-value  Pr(>|t|)    
## polarity_news       4.9942e+01  1.1329e+01  4.4081  2.82e-05 ***
## polarity_twitter   -2.8625e+00  3.1879e+00 -0.8979 0.3715737    
## yoy_revenue_growth -6.1430e-02  1.5659e-02 -3.9230 0.0001684 ***
## google_trends      -2.8239e-05  5.3186e-03 -0.0053 0.9957752    
## number_of_news      8.9655e-03  3.0480e-02  0.2941 0.7693114    
## number_of_tweets   -7.5085e-06  3.3123e-03 -0.0023 0.9981962    
## sales              -2.9688e-02  1.2494e-02 -2.3762 0.0195655 *  
## ROA                 1.7542e-01  1.0245e-02 17.1230 < 2.2e-16 ***
## fin_leverage       -9.7198e-03  1.7936e-02 -0.5419 0.5891945    
## buyback_yield       5.5794e-02  2.4152e-02  2.3101 0.0231191 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    48.502
## Residual Sum of Squares: 9.0925
## R-Squared:      0.81253
## Adj. R-Squared: 0.58635
## F-statistic: 39.8753 on 10 and 92 DF, p-value: < 2.22e-16
fixed3_2 <- plm(price_to_book ~ polarity_news + polarity_twitter + yoy_revenue_growth +  google_trends + number_of_news + number_of_likes_twitter + sales +  ROA + fin_leverage + buyback_yield, data = dataPanel3, model="within")
summary(fixed3_2)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = price_to_book ~ polarity_news + polarity_twitter + 
##     yoy_revenue_growth + google_trends + number_of_news + number_of_likes_twitter + 
##     sales + ROA + fin_leverage + buyback_yield, data = dataPanel3, 
##     model = "within")
## 
## Balanced Panel: n = 102, T = 2, N = 204
## 
## Residuals:
##        Min.     1st Qu.      Median     3rd Qu.        Max. 
## -6.3721e-01 -1.3447e-01  1.5873e-16  1.3447e-01  6.3721e-01 
## 
## Coefficients:
##                            Estimate  Std. Error t-value  Pr(>|t|)    
## polarity_news            4.9996e+01  1.1189e+01  4.4683 2.241e-05 ***
## polarity_twitter        -3.0370e+00  3.3248e+00 -0.9134 0.3634047    
## yoy_revenue_growth      -6.0963e-02  1.5708e-02 -3.8809 0.0001955 ***
## google_trends            2.2066e-05  5.1682e-03  0.0043 0.9966026    
## number_of_news           1.0572e-02  2.3456e-02  0.4507 0.6532559    
## number_of_likes_twitter -9.4426e-04  6.3769e-03 -0.1481 0.8826073    
## sales                   -3.0066e-02  1.2236e-02 -2.4572 0.0158772 *  
## ROA                      1.7514e-01  1.0238e-02 17.1072 < 2.2e-16 ***
## fin_leverage            -9.6844e-03  1.7763e-02 -0.5452 0.5869370    
## buyback_yield            5.5892e-02  2.4141e-02  2.3152 0.0228244 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    48.502
## Residual Sum of Squares: 9.0904
## R-Squared:      0.81258
## Adj. R-Squared: 0.58645
## F-statistic: 39.887 on 10 and 92 DF, p-value: < 2.22e-16
datapanel1 = dataPanel1[,-c(8, 9, 16:20)]
datapanel2 = dataPanel2[,-c(8, 9, 16:20)]
datapanel3 = dataPanel3[,-c(8, 9, 16:20)]
# Train the final model on the full training data
final_model1 <- final_spec %>% fit(formula = price_to_book ~ ., 
                                  data = datapanel1[,-c(1:5)])
final_model2 <- final_spec %>% fit(formula = price_to_book ~ ., 
                                  data = datapanel2[,-c(1:5)])
final_model3 <- final_spec %>% fit(formula = price_to_book ~ ., 
                                  data = datapanel3[,-c(1:5)])

vip::vip(final_model1)

vip::vip(final_model2)

vip::vip(final_model3)

h1 = vip::vi_model(final_model1)
h1 = h1 %>% 
  mutate(model = c(1,1,1,1,1,1,1))
h2 = vip::vi_model(final_model2)
h2 = h2 %>% 
  mutate(model = c(2,2,2,2,2,2,2))
h3 = vip::vi_model(final_model3)
h3 = h3 %>% 
  mutate(model = c(3,3,3,3,3,3,3))
h = rbind(h1,h2,h3)
colnames(h) = c("Variable", "Importance", "Model")
h
## # A tibble: 21 × 3
##    Variable                Importance Model
##    <chr>                        <dbl> <dbl>
##  1 number_of_tweets            0.457      1
##  2 polarity_news               0.231      1
##  3 google_trends               0.100      1
##  4 yoy_revenue_growth          0.0653     1
##  5 number_of_likes_twitter     0.0588     1
##  6 polarity_twitter            0.0483     1
##  7 number_of_news              0.0398     1
##  8 number_of_tweets            0.467      2
##  9 polarity_news               0.320      2
## 10 number_of_likes_twitter     0.0611     2
## # … with 11 more rows
p = ggplot(h, aes(Variable, Importance, group = Model)) 
p + geom_line(aes(colour = Model)) + geom_point(aes(colour = Model))