1) Basic statistics of continuous variable
# Mean, standard deviation, minimum, maximum, range, quartiles, IQR, skewness and kurtosis
statistics <- function(x)
{
mean <- mean(x)
standard_deviation <- sd(x)
min <- min(x)
max <- max(x)
range <- max - min
first_quartile <- quantile(x)[2]
third_quartile <- quantile(x)[4]
IQR <- third_quartile - first_quartile
skewness <- mean((x-mean(x))^3)/(sd(x)^3)
kurtosis <- (mean((x-mean(x))^4))/sd(x)^4
return(cat("\t","Mean: ",mean,"\n\t","Standard deviation: ",standard_deviation,"\n\t","Minimum value: ",min,"\n\t","Maximum value: ",max,"\n\t","Range: ",range,"\n\t","First Quartile: ",first_quartile,"\n\t","Third Quartile: ",third_quartile,"\n\t","Interquartile Range: ",IQR,"\n\t","Skewness: ",skewness,"\n\t","Kurtosis: ",kurtosis))
}
1.1) Basic statistics of the variable “temp” (Normalized temperature in Celsius)
statistics(bike_sharing$temp)
## Mean: 0.497
## Standard deviation: 0.184
## Minimum value: 0.0591
## Maximum value: 0.862
## Range: 0.803
## First Quartile: 0.337
## Third Quartile: 0.657
## Interquartile Range: 0.321
## Skewness: -0.0665
## Kurtosis: 1.86
1.2) Basic statistics of the variable “hum” (Normalized humidity)
statistics(bike_sharing$hum)
## Mean: 0.621
## Standard deviation: 0.135
## Minimum value: 0.188
## Maximum value: 0.973
## Range: 0.785
## First Quartile: 0.518
## Third Quartile: 0.72
## Interquartile Range: 0.202
## Skewness: -0.0296
## Kurtosis: 2.54
1.3) Basic statistics of the variable “windspeed” (Normalized wind speed)
statistics(bike_sharing$windspeed)
## Mean: 0.189
## Standard deviation: 0.0767
## Minimum value: 0.0224
## Maximum value: 0.507
## Range: 0.485
## First Quartile: 0.134
## Third Quartile: 0.231
## Interquartile Range: 0.0966
## Skewness: 0.705
## Kurtosis: 3.53
1.4) Basic statistics of the variable “cnt” (Count of total rental bikes)
statistics(bike_sharing$cnt)
## Mean: 4584
## Standard deviation: 1897
## Minimum value: 431
## Maximum value: 8714
## Range: 8283
## First Quartile: 3296
## Third Quartile: 6039
## Interquartile Range: 2743
## Skewness: -0.0468
## Kurtosis: 2.19
4) Scatter plot and linear regression
4.1.1) Scatter plot between “temp” (Normalized temperature in Celsius) and “cnt” (Count of total rental bikes)
# Spoken Tutorials:
# 17. Introduction to ggplot2
# 18. Aesthetic Mapping in ggplot2
# "cnt" and "temp" variables
# Scatter plot
ggplot(data = bike_sharing,aes(y=cnt,x=temp)) + geom_point() + labs(x = "Normalized temperature in Celsius", y = "Count of total rental bikes") + theme_bw()

4.1.2) Linear regression of “cnt” (Count of total rental bikes) on “temp” (Normalized temperature in Celsius)
cnt_temp_lm <- lm(cnt~temp,data = bike_sharing)
summary(cnt_temp_lm)
##
## Call:
## lm(formula = cnt ~ temp, data = bike_sharing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4662 -1154 -89 1036 3675
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1339 159 8.43 <2e-16 ***
## temp 6526 300 21.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1470 on 708 degrees of freedom
## Multiple R-squared: 0.401, Adjusted R-squared: 0.4
## F-statistic: 474 on 1 and 708 DF, p-value: <2e-16
4.1.3) Scatter plot of “temp” (Normalized temperature in Celsius) and “cnt” (Count of total rental bikes) with regression line
ggplot(data = bike_sharing,aes(y=cnt,x=temp)) + geom_point() + labs(x = "Normalized temperature in Celsius", y = "Count of total rental bikes") + theme_bw() + geom_smooth(method='lm', formula = y~x)

4.1.4) Residual Analysis
# Plotting squared residuals (Homogeneity Validation)
plot(y=cnt_temp_lm$residuals^2,x=as.numeric(names(cnt_temp_lm$residuals^2)),ylab="Squared Residuals",xlab="Index")

# qq-plot of residuals (Normality Validation)
qqnorm(rstandard(cnt_temp_lm), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(cnt_temp_lm))

# Histogram of residuals (Normality Validation)
hist(cnt_temp_lm$residuals,main = "Histogram of Residuals",xlab="",prob=T,ylim=c(0,0.0003))
x <- seq(min(cnt_temp_lm$residuals),max(cnt_temp_lm$residuals),length=40)
y <- dnorm(x,mean=mean(cnt_temp_lm$residuals),sd=sd(cnt_temp_lm$residuals))
lines(x, y, col="red", lwd=2)

4.2.1) Scatter plot of “hum” (Normalized humidity) and “cnt” (Count of total rental bikes)
ggplot(data = bike_sharing,aes(y=cnt,x=hum)) + geom_point() + labs(x = "Normalized humidity", y = "Count of total rental bikes") + theme_bw()

4.2.2) Linear regression of “cnt” (Count of total rental bikes) on “hum” (Normalized humidity)
cnt_hum_lm <- lm(cnt~hum,data = bike_sharing)
summary(cnt_hum_lm)
##
## Call:
## lm(formula = cnt ~ hum, data = bike_sharing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4110 -1330 5 1491 4052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4989 335 14.88 <2e-16 ***
## hum -651 527 -1.23 0.22
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1900 on 708 degrees of freedom
## Multiple R-squared: 0.00215, Adjusted R-squared: 0.000738
## F-statistic: 1.52 on 1 and 708 DF, p-value: 0.217
4.2.3) Scatter plot of “hum” (Normalized humidity) and “cnt” (Count of total rental bikes) with regression line
ggplot(data = bike_sharing,aes(y=cnt,x=hum)) + geom_point() + labs(x = "Normalized humidity", y = "Count of total rental bikes") + theme_bw() + geom_smooth(method='lm', formula = y~x)

4.2.4) Residual Analysis
# Plot of squared residuals (Homogeneity Validation)
plot(y=cnt_hum_lm$residuals^2,x=as.numeric(names(cnt_hum_lm$residuals^2)),ylab="Squared Residuals",xlab="Index")

# Normal qq-plot (Normality Validation)
qqnorm(rstandard(cnt_hum_lm), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(cnt_hum_lm))

# Histogram of residual (Normality Validation)
hist(cnt_hum_lm$residuals,main = "Histogram of Residuals",xlab="",prob=T)
x <- seq(min(cnt_hum_lm$residuals),max(cnt_hum_lm$residuals),length=40)
y <- dnorm(x,mean=mean(cnt_hum_lm$residuals),sd=sd(cnt_hum_lm$residuals))
lines(x, y, col="red", lwd=2)

4.3.1) Scatter plot of “windspeed” (Wind speed) and “cnt” (Count of total rental bikes)
ggplot(data = bike_sharing,aes(y=cnt,x=windspeed)) + geom_point() + labs(x = "Normalized wind speed", y = "Count of total rental bikes") + theme_bw()

4.3.2) Linear regression of “cnt” (Count of total rental bikes) on “windspeed” (Wind speed)
cnt_windspeed_lm <- lm(cnt~windspeed,data = bike_sharing)
summary(cnt_windspeed_lm)
##
## Call:
## lm(formula = cnt ~ windspeed, data = bike_sharing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4538 -1314 -75 1437 4430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5554 185 29.97 < 2e-16 ***
## windspeed -5130 909 -5.65 2.4e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1860 on 708 degrees of freedom
## Multiple R-squared: 0.0431, Adjusted R-squared: 0.0417
## F-statistic: 31.9 on 1 and 708 DF, p-value: 2.38e-08
4.3.3) Scatter plot of “windspeed” (Wind Speed) and “cnt” (Count of total rental bikes) with regression line
ggplot(data = bike_sharing,aes(y=cnt,x=windspeed)) + geom_point() + labs(x = "Normalized wind speed", y = "Count of total rental bikes") + theme_bw() + geom_smooth(method='lm', formula = y~x)

4.3.4) Residual Analysis
# Plot of squared residuals (Homogeneity Validation)
plot(y=cnt_windspeed_lm$residuals^2,x=as.numeric(names(cnt_windspeed_lm$residuals^2)),ylab="Squared Residuals",xlab="Index")

# Normal qq-plot (Normality Validation)
qqnorm(rstandard(cnt_windspeed_lm), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(cnt_windspeed_lm))

# Histogram of residual (Normality Validation)
hist(cnt_windspeed_lm$residuals,main = "Histogram of Residuals",xlab="",prob=T)
x <- seq(min(cnt_windspeed_lm$residuals),max(cnt_windspeed_lm$residuals),length=40)
y <- dnorm(x,mean=mean(cnt_windspeed_lm$residuals),sd=sd(cnt_windspeed_lm$residuals))
lines(x, y, col="red", lwd=2)

5) Multiple regression model
5.1) Multiple regression model of “cnt” (Count of total rental bikes) on [“temp” (Normalized temperature) + “hum” (Normalized humidity) + “windspeed” (Wind Speed) + “weathersit” (Weather situation)]
multi_reg <- lm(cnt~temp+hum+windspeed+weathersit,data = bike_sharing)
summary(multi_reg)
##
## Call:
## lm(formula = cnt ~ temp + hum + windspeed + weathersit, data = bike_sharing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4148 -1119 -92 1082 3618
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3810 353 10.78 < 2e-16 ***
## temp 6446 300 21.50 < 2e-16 ***
## hum -2147 519 -4.14 4.0e-05 ***
## windspeed -4046 738 -5.49 5.8e-08 ***
## weathersit -247 140 -1.76 0.079 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1410 on 705 degrees of freedom
## Multiple R-squared: 0.45, Adjusted R-squared: 0.447
## F-statistic: 144 on 4 and 705 DF, p-value: <2e-16
5.2) Residual Analysis
# Plot of squared residual (Homogeneity Validation)
plot(y=multi_reg$residuals^2,x=as.numeric(names(multi_reg$residuals^2)),ylab="Squared Residuals",xlab="Index")

# Normal qq-plot (Normality Validation)
qqnorm(rstandard(multi_reg), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(multi_reg))

# Histogram of residual (Normality Validation)
hist(multi_reg$residuals,main = "Histogram of Residuals",xlab="",prob=T,ylim=c(0,0.0003))
x <- seq(min(multi_reg$residuals),max(multi_reg$residuals),length=40)
y <- dnorm(x,mean=mean(multi_reg$residuals),sd=sd(multi_reg$residuals))
lines(x, y, col="red", lwd=2)
