library(tidyverse)
library(caret)
library(glmnet)
# Load and clean the data
# read_excel() lives in the readxl package, which library(tidyverse) installs
# but does not attach, so it is called through its namespace here.
Data <- readxl::read_excel('D:\\4 UCI\\Machine Learning\\HW1\\Iowa_Housing_Data_Mod.xlsx')
# Derived features: age of the house at sale time and a 0/1 flag for central AC
Data$`Age of House` <- Data$YrSold - Data$YearBuilt
Data$`CentralAC Dummy` <- ifelse(Data$CentralAir == 'Y', 1, 0)
# Drop the columns that were only needed to build the derived features (plus
# the unnamed/Id columns), then remove every row that still contains an NA
Data <- Data %>%
  select(-c(Unnamed.28, Unnamed.29, CentralAir, YrSold, YearBuilt, Id)) %>%
  drop_na()
# The split below is positional: the first 1800 rows are the training set, the
# next 600 are the validation set, and the final 508 are the test set. Fail
# loudly if cleaning left fewer rows than the split needs.
if (nrow(Data) < 2908) {
  stop(
    "Expected at least 2908 complete rows after cleaning, found ", nrow(Data),
    call. = FALSE
  )
}
# NOTE: `train` shares its name with caret::train(); R still resolves
# train(...) calls to the function because non-function objects are skipped
# during function lookup.
train <- Data[1:1800, ]
val <- Data[1801:2400, ]
test <- Data[2401:2908, ]
# Simple linear regression of SalePrice on GrLivArea
# Two methods
# Method 1: caret::train() with method = "lm"
# train() expects one row per observation in x and a plain numeric vector for
# y, so the inputs are passed in that orientation (no transposition).
Y_simple_1 <- Data$SalePrice
X_simple_1 <- data.frame(GrLivArea = Data$GrLivArea)
# Namespace-qualified because the script also defines a data object `train`
lr_simple_1 <- caret::train(X_simple_1, Y_simple_1, method = "lm")
# Coefficients of the fitted model (intercept and slope)
coef(lr_simple_1$finalModel)
# Intercept
coef(lr_simple_1$finalModel)[1]
# R-squared: squared correlation between observed and fitted values
R_2_s <- cor(Y_simple_1, fitted(lr_simple_1$finalModel))^2
# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n and p taken
# from the data instead of being hard-coded
n_obs <- length(Y_simple_1)
n_pred <- ncol(X_simple_1)
R_2_s_adjusted <- 1 - (1 - R_2_s) * (n_obs - 1) / (n_obs - n_pred - 1)
# Method 2: stats::lm() called directly
Y_simple_2 <- Data$SalePrice
X_simple_2 <- Data$GrLivArea
# lm() adds the intercept term itself. Binding an explicit column of ones onto
# the predictor would make that column perfectly collinear with the intercept
# and produce an aliased (NA) coefficient, so the predictor is passed as-is.
ols <- lm(Y_simple_2 ~ X_simple_2)
# Summary holds the coefficient table, R-squared and adjusted R-squared
lr_simple_2 <- summary(ols)
#multiple Linear regression
#two methods
#method 1
Y_multiple_1 <- as.matrix(Data$SalePrice)
X_multiple_1 <- as.matrix(Data %>% select(LotArea, OverallQual, OverallCond, `Age of House