learning resources

proof of least squares solution

https://math.stackexchange.com/questions/131590/derivation-of-the-formula-for-ordinary-least-squares-linear-regression

linear regression using the linear model (lm) function

how to obtain the coefficients (intercept and slope) of a linear model

library(UsingR)
library(ggplot2)

# generate some fake linear data: outcome = beta * observed + intercept + noise
set.seed(1234)
beta <- 2        # true slope
intercept <- 10  # true intercept
n <- 50          # number of observations
m <- 10          # noise mean
s <- 10          # noise standard deviation
noise <- rnorm(n, mean = m, sd = s)
# use n rather than a hard-coded 50 so the sample size is set in one place
observed <- runif(n, min = 1, max = 100)
# intercept is a scalar, so R recycles it across the vector: no rep() needed
outcome <- beta * observed + intercept + noise
# build the data frame directly instead of via as.data.frame(cbind(...))
fake <- data.frame(observed, outcome)

# scatterplot of the raw data: filled circles (pch 21) with a light blue fill
with(fake, plot(
  observed, outcome,
  pch = 21, bg = "lightblue", col = "black",
  cex = 1.1, frame = FALSE,
  xlab = "observed (units)", ylab = "outcome (units)"
))


# fit the simple linear model outcome ~ observed by ordinary least squares
fit <- lm(outcome ~ observed, data = fake)


# overlay the regression line and the fitted values on the existing plot
abline(reg = fit, lwd = 2)
points(fake$observed, fitted(fit), col = "red", pch = 19)

# examine the coefficients and the generated model
# (console output below is commented with ## so the script remains sourceable)
fit
## 
## Call:
## lm(formula = outcome ~ observed, data = fake)
## 
## Coefficients:
## (Intercept)     observed  
##      14.529        2.018  
coef(fit)
## (Intercept)    observed 
##   14.528576    2.017873 
summary(fit)
## 
## Call:
## lm(formula = outcome ~ observed, data = fake)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.365  -5.018  -1.018   4.001  28.201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.5286     2.5597   5.676 7.81e-07 ***
## observed      2.0179     0.0423  47.703  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.926 on 48 degrees of freedom
## Multiple R-squared:  0.9793,    Adjusted R-squared:  0.9789 
## F-statistic:  2276 on 1 and 48 DF,  p-value: < 2.2e-16
# plot() on an lm object produces the following 6 diagnostic plots:
# 1) residuals against fitted values (residuals should be uncorrelated with fitted values),
# 2) a Scale-Location plot of sqrt(|residuals|) against fitted values,
# 3) a Normal Q-Q plot (error terms are assumed normally distributed for many tests),
# 4) Cook's distances versus row labels,
# 5) residuals against leverages, and
# 6) Cook's distances against leverage/(1-leverage).
# see methods(plot) then ?plot.lm for more
plot(fit, which = 1:6)

regression on centred data gives an intercept equal to the value of y at the mean value of x — i.e. the mean of the outcome

# Centre the predictor so the intercept of the refit equals the fitted value
# at the mean of observed (i.e. the mean outcome); the slope is unchanged.
# The I() notation is a shortcut that evaluates the expression inline in the lm call:
# fit2 <- lm(outcome ~ I(observed - mean(observed)), data = fake)
observed.c <- observed - mean(observed)
fit2 <- lm(outcome ~ observed.c, data = fake)
coef(fit2)
## (Intercept)  observed.c 
##  120.756980    2.017873
# scatterplot of outcome against the centred predictor
plot(
  observed.c, fake$outcome,
  pch = 21, bg = "lightblue", col = "black",
  cex = 1.1, frame = FALSE,
  xlab = "observed (units)", ylab = "outcome (units)"
)




# overlay the regression line and the fitted values for the centred fit
abline(reg = fit2, lwd = 2)
points(observed.c, fitted(fit2), col = "red", pch = 19)

# examine the coefficients and the generated model:
# the slope matches fit exactly; only the intercept has changed
coef(fit2)
## (Intercept)  observed.c 
##  120.756980    2.017873
fit2
## 
## Call:
## lm(formula = outcome ~ observed.c, data = fake)
## 
## Coefficients:
## (Intercept)   observed.c  
##     120.757        2.018

making predictions with a model

# x holds the observations at which we want predictions from the model
x <- c(10, 50, 90)
# predict() expects new data in a data frame whose column name matches
# the predictor variable used in the model formula
new_obs <- data.frame(observed = x)
predict(fit, newdata = new_obs)
##        1        2        3 
##  34.7073 115.4222 196.1371

plotting regression lines in ggplot

https://www.rstudio.com/wp-content/uploads/2015/03/ggplot2-cheatsheet.pdf

# scatter of the fake data with a fitted lm line and its confidence band;
# the two point layers stack a faint blue centre on a darker outline
g <- ggplot(fake, aes(x = observed, y = outcome)) +
  xlab("observed (units)") +
  ylab("outcome (units)") +
  geom_point(size = 2, colour = "black", alpha = 0.4) +
  geom_point(size = 1, colour = "blue", alpha = 0.2) +
  geom_smooth(method = "lm", colour = "black")
g

plotting interactive regression plot in plotly

https://plot.ly/ggplot2/geom_abline/

library(plotly)
# ggplotly() converts the ggplot object g into an interactive plotly widget
p <- ggplotly(g)
p

obtain residuals for a fit

# residuals (observed minus fitted values) via the resid() accessor
e <- resid(fit)
# or: read the residuals component stored on the lm object directly
fit$residuals
##           1           2           3           4           5           6 
##  -6.6798441   7.2279911  15.8020713 -18.3647334   9.5081587   9.9378445 
##           7           8           9          10          11          12 
##  -0.5682171  -0.2427242  -0.9616028  -3.5152021  -0.5805001  -4.7086154 
##          13          14          15          16          17          18 
##  -3.9904686   5.8825933  14.6598285   2.7350679  -1.3301476  -4.1523455 
##          19          20          21          22          23          24 
##  -3.1366371  28.2013977   5.4775004  -1.0740435  -0.7117815   8.3820214 
##          25          26          27          28          29          30 
##  -2.3438252  -9.5300548  10.7560202  -5.6734718   4.4228302  -4.4693968 
##          31          32          33          34          35          36 
##  14.7739582  -0.4243626  -1.8663316  -0.3075739 -12.4551867  -7.0503589 
##          37          38          39          40          41          42 
## -17.9537706  -9.0140349   1.3927993  -0.7433233  19.0589422  -6.9733317 
##          43          44          45          46          47          48 
##  -3.6740678   1.7955705  -5.1215069  -5.3418585  -6.9318270  -8.0677362 
##          49          50 
##  -1.5201453  -0.5355678
#