Para replicar las secciones de esta clase, primero debe descargar el
siguiente proyecto
de R y abrir el archivo clase-06.Rproj
.
## Instalar/llamar las librerías de la clase
require(pacman)
p_load(tidyverse,rio,janitor,
# CARET: Classification and Regression Training
caret,
glmnet,# Elastic Net elasticnet)
set.seed(12345)
<- import("output/data_regresiones.rds") %>% na.omit() %>% select(-id) %>% clean_names() db
### tibble to matrix
<- model.matrix(~.+ I(surface_total^2) + I(dist_cbd^2) -1 -price , db) %>% as_tibble() %>% clean_names()
df
<- scale(x = df) %>% as_tibble()
df
$price <- db$price/1000000 %>% unlist()
df
### partir la muestra
<- sample(x = nrow(df) , size = nrow(df)*.3)
sample_test
<- df[sample_test,]
test
<- df[-sample_test,] train
## Linear Regression
<- lm(price~.,data = train)
lm_model
## results
summary(lm_model)
##
## Call:
## lm(formula = price ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53054 -263 -73 111 59404
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 834.735 5.570 149.856 < 2e-16 ***
## rooms 24.870 5.485 4.534 5.80e-06 ***
## bathrooms 413.460 6.281 65.828 < 2e-16 ***
## surface_total 1018.031 17.205 59.169 < 2e-16 ***
## property_type_apartamento -40.223 203.683 -0.197 0.84345
## property_type_casa -103.099 184.031 -0.560 0.57533
## property_type_deposito -49.716 27.819 -1.787 0.07392 .
## property_type_finca -24.407 10.769 -2.266 0.02343 *
## property_type_local_comercial 78.329 40.783 1.921 0.05478 .
## property_type_lote 75.053 19.066 3.936 8.28e-05 ***
## property_type_oficina 70.257 58.948 1.192 0.23333
## property_type_otro 29.382 124.291 0.236 0.81312
## property_type_parqueadero -21.764 15.097 -1.442 0.14941
## property_type_ph NA NA NA NA
## dist_cbd 61.458 23.783 2.584 0.00977 **
## dist_cole -74.507 5.846 -12.745 < 2e-16 ***
## dist_park 9.638 6.320 1.525 0.12724
## luces_2019 -1884.318 1823.068 -1.034 0.30133
## luces_2021 1577.859 1540.067 1.025 0.30559
## cambio_activ -446.308 434.588 -1.027 0.30444
## i_surface_total_2 289.952 65.188 4.448 8.69e-06 ***
## i_dist_cbd_2 -172.005 24.886 -6.912 4.85e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1172 on 44334 degrees of freedom
## Multiple R-squared: 0.4019, Adjusted R-squared: 0.4016
## F-statistic: 1489 on 20 and 44334 DF, p-value: < 2.2e-16
lm_model
##
## Call:
## lm(formula = price ~ ., data = train)
##
## Coefficients:
## (Intercept) rooms
## 834.735 24.870
## bathrooms surface_total
## 413.460 1018.031
## property_type_apartamento property_type_casa
## -40.223 -103.099
## property_type_deposito property_type_finca
## -49.716 -24.407
## property_type_local_comercial property_type_lote
## 78.329 75.053
## property_type_oficina property_type_otro
## 70.257 29.382
## property_type_parqueadero property_type_ph
## -21.764 NA
## dist_cbd dist_cole
## 61.458 -74.507
## dist_park luces_2019
## 9.638 -1884.318
## luces_2021 cambio_activ
## 1577.859 -446.308
## i_surface_total_2 i_dist_cbd_2
## 289.952 -172.005
## predictions
$price_lm <- predict(lm_model , newdata=test) test
## Warning in predict.lm(lm_model, newdata = test): prediction from a rank-
## deficient fit may be misleading
## Problema: Overfit
# + Regresion: L(Y_observado,Y_predicho) = (Y_observado - Y_predicho)^2
# + Modelos muy complejos tienden a predecir bien dentro de la muestra y mal fuera de ella
# + R2 no funciona: mide predicción dentro de la muestra, no decreciente en complejidad
## Posible solución: K-fold validation
## Algoritmo de K-fold validation
# + Partir el conjunto de datos en K partes
# + Ajustar el modelo dejando particiones afuera
# + Computar el error de predicción para los datos de test (datos no usados en la estimación)
# + Repetir para k=1,...K
## Observaciones de K-fold validation
# + El modelo se estima 1 sola vez con las N observaciones, pero se estima RMSE para cada particion
# + Cada observación es usada como entrenamiento K-1 vez y como test 1 vez
# + Qué pasa si K=1? (Modelo econometria)
# + Qué pasa si K=N? (Leave One-Out Croos Validation)
## Cómo elegir el K?
# + K -> 0: Máximiza datos para estimar (más presición) pero es sensible a valores particulares.
# + K -> N: Máximiza datos para evaluar pero el modelo es estimado con menor presición.
## estimacion
<- train(price~.,
cv_5 data = train,
trControl = trainControl(method = "cv", number = 5),
method = "lm")
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## results
cv_5
## Linear Regression
##
## 44355 samples
## 21 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 35486, 35483, 35484, 35483, 35484
## Resampling results:
##
## RMSE Rsquared MAE
## 1184.137 0.4056361 396.8995
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(cv_5)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53054 -263 -73 111 59404
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 834.735 5.570 149.856 < 2e-16 ***
## rooms 24.870 5.485 4.534 5.80e-06 ***
## bathrooms 413.460 6.281 65.828 < 2e-16 ***
## surface_total 1018.031 17.205 59.169 < 2e-16 ***
## property_type_apartamento -40.223 203.683 -0.197 0.84345
## property_type_casa -103.099 184.031 -0.560 0.57533
## property_type_deposito -49.716 27.819 -1.787 0.07392 .
## property_type_finca -24.407 10.769 -2.266 0.02343 *
## property_type_local_comercial 78.329 40.783 1.921 0.05478 .
## property_type_lote 75.053 19.066 3.936 8.28e-05 ***
## property_type_oficina 70.257 58.948 1.192 0.23333
## property_type_otro 29.382 124.291 0.236 0.81312
## property_type_parqueadero -21.764 15.097 -1.442 0.14941
## property_type_ph NA NA NA NA
## dist_cbd 61.458 23.783 2.584 0.00977 **
## dist_cole -74.507 5.846 -12.745 < 2e-16 ***
## dist_park 9.638 6.320 1.525 0.12724
## luces_2019 -1884.318 1823.068 -1.034 0.30133
## luces_2021 1577.859 1540.067 1.025 0.30559
## cambio_activ -446.308 434.588 -1.027 0.30444
## i_surface_total_2 289.952 65.188 4.448 8.69e-06 ***
## i_dist_cbd_2 -172.005 24.886 -6.912 4.85e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1172 on 44334 degrees of freedom
## Multiple R-squared: 0.4019, Adjusted R-squared: 0.4016
## F-statistic: 1489 on 20 and 44334 DF, p-value: < 2.2e-16
## k-folds
nrow(train)
## [1] 44355
nrow(train)/5
## [1] 8871
nrow(train)-nrow(train)/5
## [1] 35484
## predictions
$price_cv5 <- predict(cv_5 , newdata=test) test
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## predictores
<- train %>% select(-price) %>% as.matrix()
x <- train$price
y
## estimations
<- glmnet(x=x , y=y , alpha=1) ## (alpha = 1 para lasso)
lasso
dim(coef(lasso)) ## Número de variables y modelos
## [1] 22 67
## plot model
plot(x=lasso, xvar="lambda" , label=TRUE)
### **2.1 Coeficientes y lambdas**
## coeficientes
<- coef(lasso)
coef_lasso
## coeficientes modelo 15
15] coef_lasso[,
## (Intercept) rooms
## 833.1283 0.0000
## bathrooms surface_total
## 206.5318 828.0427
## property_type_apartamento property_type_casa
## 0.0000 0.0000
## property_type_deposito property_type_finca
## 0.0000 0.0000
## property_type_local_comercial property_type_lote
## 0.0000 0.0000
## property_type_oficina property_type_otro
## 0.0000 0.0000
## property_type_parqueadero property_type_ph
## 0.0000 0.0000
## dist_cbd dist_cole
## 0.0000 0.0000
## dist_park luces_2019
## 0.0000 0.0000
## luces_2021 cambio_activ
## 0.0000 0.0000
## i_surface_total_2 i_dist_cbd_2
## 0.0000 0.0000
## lambdas
<- lasso$lambda
lambdas
lambdas
## [1] 842.886967 768.007212 699.779568 637.613080 580.969291 529.357581
## [7] 482.330912 439.481963 400.439597 364.865647 332.451988 302.917870
## [13] 276.007481 251.487737 229.146260 208.789538 190.241251 173.340742
## [19] 157.941628 143.910529 131.125915 119.477050 108.863038 99.191946
## [25] 90.380007 82.350896 75.035069 68.369161 62.295433 56.761278
## [31] 51.718763 47.124210 42.937825 39.123347 35.647737 32.480890
## [37] 29.595378 26.966206 24.570602 22.387818 20.398946 18.586760
## [43] 16.935563 15.431054 14.060202 12.811132 11.673027 10.636027
## [49] 9.691152 8.830217 8.045765 7.331001 6.679735 6.086326
## [55] 5.545633 5.052975 4.604082 4.195068 3.822390 3.482819
## [61] 3.173415 2.891498 2.634625 2.400572 2.187312 1.992997
## [67] 1.815945
15] lambdas[
## [1] 229.1463
## Para saber el lambda modelo N
plot(x = lasso , xvar = "lambda" , label = TRUE)
abline(v=log(lambdas[15]), col="blue",lwd= 4,lty =3)
### **2.2 Predicciones**
## predicciones
<- test %>% select(colnames(x)) %>% as.matrix()
test_matrix
$price_lasso = predict(object=lasso , s=lasso$lambda[15] , newx=test_matrix)
test
$price_lasso %>% head() test
## s1
## [1,] 431.5073
## [2,] 1002.3586
## [3,] 435.8589
## [4,] 591.3300
## [5,] 404.3099
## [6,] 965.3702
## Mejor lambda con validación cruzada
<- cv.glmnet(x = x , y = y , alpha=1 , nfolds = 10)
lasso_cv
plot(lasso_cv)
= lasso_cv$lambda.min
min_lambda
log(min_lambda)
## [1] 4.131888
## hacer predicción con mejor lambda
coef(lasso)[,which(lasso$lambda==min_lambda)]
## (Intercept) rooms
## 834.137474 0.000000
## bathrooms surface_total
## 346.385016 1020.857724
## property_type_apartamento property_type_casa
## 0.000000 -16.699912
## property_type_deposito property_type_finca
## 0.000000 0.000000
## property_type_local_comercial property_type_lote
## 27.143348 27.018594
## property_type_oficina property_type_otro
## 22.718075 1.602731
## property_type_parqueadero property_type_ph
## 0.000000 0.000000
## dist_cbd dist_cole
## 0.000000 -33.097949
## dist_park luces_2019
## 0.000000 0.000000
## luces_2021 cambio_activ
## 0.000000 0.000000
## i_surface_total_2 i_dist_cbd_2
## 0.000000 -68.070520
$price_lasso_cv = predict(object=lasso , s=min_lambda , newx=test_matrix)
test
$price_lasso_cv %>% head() test
## s1
## [1,] 238.0806
## [2,] 1105.4547
## [3,] 293.6746
## [4,] 448.3066
## [5,] 294.8577
## [6,] 1094.6151
### **2.2 Tunnear el modelo**
<- glmnet(x=x , y=y , alpha=1) ## (alpha = 1 para lasso) lasso