Loading libraries

suppressPackageStartupMessages(library(tidyverse))

Reading data

# Spoken Tutorials:
# 5. Introduction to Data Frames in R
# 9. Indexing and Slicing Data Frames

# Source: https://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset
bike_sharing <- read_csv("day.csv")
## Rows: 731 Columns: 16
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl  (15): instant, season, yr, mnth, holiday, weekday, workingday, weathers...
## date  (1): dteday
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
bike_sharing <- bike_sharing[,-c(1:8,11,14:15)]
bike_sharing <- bike_sharing %>% filter(weathersit!=3)

1) Basic statistics of continuous variable

# Mean, standard deviation, minimum, maximum, range, quartiles, IQR, skewness and kurtosis
statistics <- function(x)
{
  mean <- mean(x)
  standard_deviation <- sd(x)
  min <- min(x)
  max <- max(x)
  range <- max - min
  first_quartile <- quantile(x)[2]
  third_quartile <- quantile(x)[4]
  IQR <- third_quartile - first_quartile
  skewness <- mean((x-mean(x))^3)/(sd(x)^3)
  kurtosis <- (mean((x-mean(x))^4))/sd(x)^4
  
  return(cat("\t","Mean: ",mean,"\n\t","Standard deviation: ",standard_deviation,"\n\t","Minimum value: ",min,"\n\t","Maximum value: ",max,"\n\t","Range: ",range,"\n\t","First Quartile: ",first_quartile,"\n\t","Third Quartile: ",third_quartile,"\n\t","Interquartile Range: ",IQR,"\n\t","Skewness: ",skewness,"\n\t","Kurtosis: ",kurtosis))
}

1.1) Basic statistics of the variable “temp” (Normalized temperature in Celsius)

statistics(bike_sharing$temp)
##   Mean:  0.497 
##   Standard deviation:  0.184 
##   Minimum value:  0.0591 
##   Maximum value:  0.862 
##   Range:  0.803 
##   First Quartile:  0.337 
##   Third Quartile:  0.657 
##   Interquartile Range:  0.321 
##   Skewness:  -0.0665 
##   Kurtosis:  1.86

1.2) Basic statistics of the variable “hum” (Normalized humidity)

statistics(bike_sharing$hum)
##   Mean:  0.621 
##   Standard deviation:  0.135 
##   Minimum value:  0.188 
##   Maximum value:  0.973 
##   Range:  0.785 
##   First Quartile:  0.518 
##   Third Quartile:  0.72 
##   Interquartile Range:  0.202 
##   Skewness:  -0.0296 
##   Kurtosis:  2.54

1.3) Basic statistics of the variable “windspeed” (Normalized wind speed)

statistics(bike_sharing$windspeed)
##   Mean:  0.189 
##   Standard deviation:  0.0767 
##   Minimum value:  0.0224 
##   Maximum value:  0.507 
##   Range:  0.485 
##   First Quartile:  0.134 
##   Third Quartile:  0.231 
##   Interquartile Range:  0.0966 
##   Skewness:  0.705 
##   Kurtosis:  3.53

1.4) Basic statistics of the variable “cnt” (Count of total rental bikes)

statistics(bike_sharing$cnt)
##   Mean:  4584 
##   Standard deviation:  1897 
##   Minimum value:  431 
##   Maximum value:  8714 
##   Range:  8283 
##   First Quartile:  3296 
##   Third Quartile:  6039 
##   Interquartile Range:  2743 
##   Skewness:  -0.0468 
##   Kurtosis:  2.19

2) Histogram (Probability density estimate)

2.1) Histogram of the variable “cnt” (Count of total rental bikes)

# Spoken Tutorials:
# 15. Plotting Histograms and Pie Chart 

# Distribution of "cnt" variables
hist(bike_sharing$cnt,breaks = 15,col="green",border="black",prob = TRUE,xlab = "",main = "Count of total rental bikes")
x <- seq(min(bike_sharing$cnt),max(bike_sharing$cnt),length=40)
y <- dnorm(x,mean=mean(bike_sharing$cnt),sd=sd(bike_sharing$cnt))
lines(x, y, col="red", lwd=2)

3) Correlation matrix among continuous variables

cor(bike_sharing[,-1])
##             temp     hum windspeed     cnt
## temp       1.000  0.1482    -0.146  0.6334
## hum        0.148  1.0000    -0.299 -0.0463
## windspeed -0.146 -0.2991     1.000 -0.2076
## cnt        0.633 -0.0463    -0.208  1.0000

4) Scatter plot and linear regression

4.1.1) Scatter plot between “temp” (Normalized temperature in Celsius) and “cnt” (Count of total rental bikes)

# Spoken Tutorials:
# 17. Introduction to ggplot2
# 18. Aesthetic Mapping in ggplot2

# "cnt" and "temp" variables
# Scatter plot
ggplot(data = bike_sharing,aes(y=cnt,x=temp)) + geom_point() + labs(x = "Normalized temperature in Celsius", y = "Count of total rental bikes") + theme_bw()

4.1.2) Linear regression of “cnt” (Count of total rental bikes) on “temp” (Normalized temperature in Celsius)

cnt_temp_lm <- lm(cnt~temp,data = bike_sharing)
summary(cnt_temp_lm)
## 
## Call:
## lm(formula = cnt ~ temp, data = bike_sharing)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4662  -1154    -89   1036   3675 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1339        159    8.43   <2e-16 ***
## temp            6526        300   21.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1470 on 708 degrees of freedom
## Multiple R-squared:  0.401,  Adjusted R-squared:   0.4 
## F-statistic:  474 on 1 and 708 DF,  p-value: <2e-16

4.1.3) Scatter plot of “temp” (Normalized temperature in Celsius) and “cnt” (Count of total rental bikes) with regression line

ggplot(data = bike_sharing,aes(y=cnt,x=temp)) + geom_point() + labs(x = "Normalized temperature in Celsius", y = "Count of total rental bikes") + theme_bw() + geom_smooth(method='lm', formula = y~x)

4.1.4) Residual Analysis

# Plotting squared residuals (Homogeneity Validation)
plot(y=cnt_temp_lm$residuals^2,x=as.numeric(names(cnt_temp_lm$residuals^2)),ylab="Squared Residuals",xlab="Index")

# qq-plot of residuals (Normality Validation)
qqnorm(rstandard(cnt_temp_lm), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(cnt_temp_lm))

# Histogram of residuals (Normality Validation)
hist(cnt_temp_lm$residuals,main = "Histogram of Residuals",xlab="",prob=T,ylim=c(0,0.0003))
x <- seq(min(cnt_temp_lm$residuals),max(cnt_temp_lm$residuals),length=40)
y <- dnorm(x,mean=mean(cnt_temp_lm$residuals),sd=sd(cnt_temp_lm$residuals))
lines(x, y, col="red", lwd=2)

4.2.1) Scatter plot of “hum” (Normalized humidity) and “cnt” (Count of total rental bikes)

ggplot(data = bike_sharing,aes(y=cnt,x=hum)) + geom_point() + labs(x = "Normalized humidity", y = "Count of total rental bikes") + theme_bw()

4.2.2) Linear regression of “cnt” (Count of total rental bikes) on “hum” (Normalized humidity)

cnt_hum_lm <- lm(cnt~hum,data = bike_sharing)
summary(cnt_hum_lm)
## 
## Call:
## lm(formula = cnt ~ hum, data = bike_sharing)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4110  -1330      5   1491   4052 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4989        335   14.88   <2e-16 ***
## hum             -651        527   -1.23     0.22    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1900 on 708 degrees of freedom
## Multiple R-squared:  0.00215,    Adjusted R-squared:  0.000738 
## F-statistic: 1.52 on 1 and 708 DF,  p-value: 0.217

4.2.3) Scatter plot of “hum” (Normalized humidity) and “cnt” (Count of total rental bikes) with regression line

ggplot(data = bike_sharing,aes(y=cnt,x=hum)) + geom_point() + labs(x = "Normalized humidity", y = "Count of total rental bikes") + theme_bw() + geom_smooth(method='lm', formula = y~x)

4.2.4) Residual Analysis

# Plot of squared residuals (Homogeneity Validation)
plot(y=cnt_hum_lm$residuals^2,x=as.numeric(names(cnt_hum_lm$residuals^2)),ylab="Squared Residuals",xlab="Index")

# Normal qq-plot (Normality Validation)
qqnorm(rstandard(cnt_hum_lm), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(cnt_hum_lm))

# Histogram of residual (Normality Validation)
hist(cnt_hum_lm$residuals,main = "Histogram of Residuals",xlab="",prob=T)
x <- seq(min(cnt_hum_lm$residuals),max(cnt_hum_lm$residuals),length=40)
y <- dnorm(x,mean=mean(cnt_hum_lm$residuals),sd=sd(cnt_hum_lm$residuals))
lines(x, y, col="red", lwd=2)

4.3.1) Scatter plot of “windspeed” (Wind speed) and “cnt” (Count of total rental bikes)

ggplot(data = bike_sharing,aes(y=cnt,x=windspeed)) + geom_point() + labs(x = "Normalized wind speed", y = "Count of total rental bikes") + theme_bw()

4.3.2) Linear regression of “cnt” (Count of total rental bikes) on “windspeed” (Wind speed)

cnt_windspeed_lm <- lm(cnt~windspeed,data = bike_sharing)
summary(cnt_windspeed_lm)
## 
## Call:
## lm(formula = cnt ~ windspeed, data = bike_sharing)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4538  -1314    -75   1437   4430 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     5554        185   29.97  < 2e-16 ***
## windspeed      -5130        909   -5.65  2.4e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1860 on 708 degrees of freedom
## Multiple R-squared:  0.0431, Adjusted R-squared:  0.0417 
## F-statistic: 31.9 on 1 and 708 DF,  p-value: 2.38e-08

4.3.3) Scatter plot of “windspeed” (Wind Speed) and “cnt” (Count of total rental bikes) with regression line

ggplot(data = bike_sharing,aes(y=cnt,x=windspeed)) + geom_point() + labs(x = "Normalized wind speed", y = "Count of total rental bikes") + theme_bw() + geom_smooth(method='lm', formula = y~x)

4.3.4) Residual Analysis

# Plot of squared residuals (Homogeneity Validation)
plot(y=cnt_windspeed_lm$residuals^2,x=as.numeric(names(cnt_windspeed_lm$residuals^2)),ylab="Squared Residuals",xlab="Index")

# Normal qq-plot (Normality Validation)
qqnorm(rstandard(cnt_windspeed_lm), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(cnt_windspeed_lm))

# Histogram of residual (Normality Validation)
hist(cnt_windspeed_lm$residuals,main = "Histogram of Residuals",xlab="",prob=T)
x <- seq(min(cnt_windspeed_lm$residuals),max(cnt_windspeed_lm$residuals),length=40)
y <- dnorm(x,mean=mean(cnt_windspeed_lm$residuals),sd=sd(cnt_windspeed_lm$residuals))
lines(x, y, col="red", lwd=2)

5) Multiple regression model

5.1) Multiple regression model of “cnt” (Count of total rental bikes) on [“temp” (Normalized temperature) + “hum” (Normalized humidity) + “windspeed” (Wind Speed) + “weathersit” (Weather situation)]

multi_reg <- lm(cnt~temp+hum+windspeed+weathersit,data = bike_sharing)
summary(multi_reg)
## 
## Call:
## lm(formula = cnt ~ temp + hum + windspeed + weathersit, data = bike_sharing)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4148  -1119    -92   1082   3618 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3810        353   10.78  < 2e-16 ***
## temp            6446        300   21.50  < 2e-16 ***
## hum            -2147        519   -4.14  4.0e-05 ***
## windspeed      -4046        738   -5.49  5.8e-08 ***
## weathersit      -247        140   -1.76    0.079 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1410 on 705 degrees of freedom
## Multiple R-squared:  0.45,   Adjusted R-squared:  0.447 
## F-statistic:  144 on 4 and 705 DF,  p-value: <2e-16

5.2) Residual Analysis

# Plot of squared residual (Homogeneity Validation)
plot(y=multi_reg$residuals^2,x=as.numeric(names(multi_reg$residuals^2)),ylab="Squared Residuals",xlab="Index")

# Normal qq-plot (Normality Validation)
qqnorm(rstandard(multi_reg), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(multi_reg))

# Histogram of residual (Normality Validation)
hist(multi_reg$residuals,main = "Histogram of Residuals",xlab="",prob=T,ylim=c(0,0.0003))
x <- seq(min(multi_reg$residuals),max(multi_reg$residuals),length=40)
y <- dnorm(x,mean=mean(multi_reg$residuals),sd=sd(multi_reg$residuals))
lines(x, y, col="red", lwd=2)