1) Loading libraries
suppressPackageStartupMessages(library(tidyverse)) # For data manipulation and analysis
suppressPackageStartupMessages(library(skimr)) # Data summary
2) Loading data
data <- suppressMessages(read_csv("updated_insurance.csv"))
3) Basic statistics
# Mean, standard deviation, minimum, maximum, range, quartiles, IQR, skewness and kurtosis
statistics <- function(x)
{
mean <- mean(x)
standard_deviation <- sd(x)
min <- min(x)
max <- max(x)
range <- max - min
first_quartile <- quantile(x)[2]
third_quartile <- quantile(x)[4]
IQR <- third_quartile - first_quartile
skewness <- mean((x-mean(x))^3)/(sd(x)^3)
kurtosis <- (mean((x-mean(x))^4))/sd(x)^4
return(cat("\t","Mean: ",mean,"\n\t","Standard deviation: ",standard_deviation,"\n\t","Minimum value: ",min,"\n\t","Maximum value: ",max,"\n\t","Range: ",range,"\n\t","First Quartile: ",first_quartile,"\n\t","Third Quartile: ",third_quartile,"\n\t","Interquartile Range: ",IQR,"\n\t","Skewness: ",skewness,"\n\t","Kurtosis: ",kurtosis))
}
# Age
statistics(data$age)
## Mean: 39.20703
## Standard deviation: 14.04996
## Minimum value: 18
## Maximum value: 64
## Range: 46
## First Quartile: 27
## Third Quartile: 51
## Interquartile Range: 24
## Skewness: 0.05554775
## Kurtosis: 1.752457
# BMI
statistics(data$bmi)
## Mean: 30.6634
## Standard deviation: 6.098187
## Minimum value: 15.96
## Maximum value: 53.13
## Range: 37.17
## First Quartile: 26.29625
## Third Quartile: 34.69375
## Interquartile Range: 8.3975
## Skewness: 0.2834106
## Kurtosis: 2.940576
# Charges
statistics(data$charges)
## Mean: 13270.42
## Standard deviation: 12110.01
## Minimum value: 1121.874
## Maximum value: 63770.43
## Range: 62648.55
## First Quartile: 4740.287
## Third Quartile: 16639.91
## Interquartile Range: 11899.63
## Skewness: 1.512483
## Kurtosis: 4.588954
4) Histogram
# Distribution of the "charges" variable
hist(data$charges,col="green",border="black",prob = TRUE,xlab = "",main = "Charges")

5) Data summary
skim(data)
Data summary
Name |
data |
Number of rows |
1338 |
Number of columns |
9 |
_______________________ |
|
Column type frequency: |
|
character |
5 |
numeric |
4 |
________________________ |
|
Group variables |
None |
Variable type: character
sex |
0 |
1 |
4 |
6 |
0 |
2 |
0 |
smoker |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
region |
0 |
1 |
9 |
9 |
0 |
4 |
0 |
bmi_category |
0 |
1 |
5 |
9 |
0 |
2 |
0 |
age_category |
0 |
1 |
14 |
14 |
0 |
2 |
0 |
Variable type: numeric
age |
0 |
1 |
39.21 |
14.05 |
18.00 |
27.00 |
39.00 |
51.00 |
64.00 |
▇▅▅▆▆ |
bmi |
0 |
1 |
30.66 |
6.10 |
15.96 |
26.30 |
30.40 |
34.69 |
53.13 |
▂▇▇▂▁ |
children |
0 |
1 |
1.09 |
1.21 |
0.00 |
0.00 |
1.00 |
2.00 |
5.00 |
▇▂▂▁▁ |
charges |
0 |
1 |
13270.42 |
12110.01 |
1121.87 |
4740.29 |
9382.03 |
16639.91 |
63770.43 |
▇▂▁▁▁ |
6) Correlation
# Correlation between all continuous variables
cor(data[,c("age","bmi","charges")])
## age bmi charges
## age 1.0000000 0.1092719 0.2990082
## bmi 0.1092719 1.0000000 0.1983410
## charges 0.2990082 0.1983410 1.0000000
7) Scatter plot
# Spoken Tutorials:
# 17. Introduction to ggplot2
# 18. Aesthetic Mapping in ggplot2
# 19. Data Manipulation using dplyr Package
# 20. More Functions in dplyr Package
# 21. Pipe Operator
# BMI and charges
# Scatter plot
ggplot(data = data,aes(y=charges,x=bmi)) + geom_point() + labs(x = "BMI", y = "Charges") + theme_bw()

# Age and charges
# Scatter plot
ggplot(data = data,aes(y=charges,x=age)) + geom_point() + labs(x = "Age", y = "Charges") + theme_bw()

# Significant variables
# 1) age_category
# 2) smoker
# 3) bmi_category
# Finding patterns using single discrete variable
# Finding patterns using "age_category" variable
ggplot(data = data,aes(y=charges,x=age)) + geom_point(aes(color=age_category)) + labs(x = "Age", y = "Charges") + theme_bw()

# Finding patterns using "smoker" variable
ggplot(data = data,aes(y=charges,x=age)) + geom_point(aes(color=smoker)) + labs(x = "Age", y = "Charges") + theme_bw()

# Finding patterns using "bmi_category" variable
ggplot(data = data,aes(y=charges,x=age)) + geom_point(aes(color=bmi_category)) + labs(x = "Age", y = "Charges") + theme_bw()

# Finding patterns using multiple discrete variables
# Finding patterns using "smoker" and "bmi_category" variables
ggplot(data = data %>% filter(bmi_category=="Obese"),aes(y=charges,x=age)) + geom_point(aes(color=smoker)) + labs(x = "Age", y = "Charges") + theme_bw()

ggplot(data = data %>% filter(bmi_category=="Non-obese"),aes(y=charges,x=age)) + geom_point(aes(color=smoker)) + labs(x = "Age", y = "Charges") + theme_bw()

# Four categories are discovered
# 1) Smoker and obese
# 2) Non-smoker and obese
# 3) Smoker and non-obese
# 4) Non-smoker and non-obese
8) Scatter plot and regression
# 1) Smoker and obese category
smoker_obese <- data %>% filter(smoker=="yes"&bmi_category=="Obese")
# Regression
smoker_obese_lm <- lm(charges~age,data = smoker_obese)
summary(smoker_obese_lm)
##
## Call:
## lm(formula = charges ~ age, data = smoker_obese)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20222.5 -2204.8 -1071.4 998.7 19382.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30558.13 1093.04 27.96 <2e-16 ***
## age 281.15 26.25 10.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4508 on 143 degrees of freedom
## Multiple R-squared: 0.4452, Adjusted R-squared: 0.4413
## F-statistic: 114.7 on 1 and 143 DF, p-value: < 2.2e-16
# Scatter plot with regression line
ggplot(data = smoker_obese,aes(y=charges,x=age)) + geom_point() + labs(x = "Age", y = "Charges") + theme_bw() + geom_smooth(method='lm', formula = y~x)

# Residuals plot
qqnorm(rstandard(smoker_obese_lm), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(smoker_obese_lm))

# Squared residuals plot
plot(y=smoker_obese_lm$residuals^2,x=as.numeric(names(smoker_obese_lm$residuals^2)),ylab="Squared Residuals",xlab="Index")

# Histogram of residuals
hist(smoker_obese_lm$residuals,main = "Histogram of Residuals",xlab="",prob=T)

# 2) Non-smoker and obese category
non_smoker_obese <- data %>% filter(smoker=="no"&bmi_category=="Obese")
# Regression
non_smoker_obese_lm <- lm(charges~age,data = non_smoker_obese)
summary(non_smoker_obese_lm)
##
## Call:
## lm(formula = charges ~ age, data = non_smoker_obese)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3234.2 -2017.5 -1435.5 -650.9 24405.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2029.05 598.59 -3.39 0.000749 ***
## age 267.39 13.88 19.27 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4738 on 560 degrees of freedom
## Multiple R-squared: 0.3987, Adjusted R-squared: 0.3976
## F-statistic: 371.3 on 1 and 560 DF, p-value: < 2.2e-16
# Scatter plot with regression line
ggplot(data = non_smoker_obese,aes(y=charges,x=age)) + geom_point() + labs(x = "Age", y = "Charges") + theme_bw() + geom_smooth(method='lm', formula = y~x)

# Residuals plot
qqnorm(rstandard(non_smoker_obese_lm), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(smoker_obese_lm))

# Squared residuals plot
plot(y=non_smoker_obese_lm$residuals^2,x=as.numeric(names(non_smoker_obese_lm$residuals^2)),ylab="Squared Residuals",xlab="Index")

# Histogram of residuals
hist(non_smoker_obese_lm$residuals,main = "Histogram of Residuals",xlab="",prob=T)

# 3) Smoker and non-obese category
smoker_non_obese <- data %>% filter(smoker=="yes"&bmi_category=="Non-obese")
# Regression
smoker_non_obese_lm <- lm(charges~age,data = smoker_non_obese)
summary(smoker_non_obese_lm)
##
## Call:
## lm(formula = charges ~ age, data = smoker_non_obese)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5587.0 -1960.8 -445.8 782.3 17388.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11503.36 962.92 11.95 <2e-16 ***
## age 260.64 23.99 10.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3662 on 127 degrees of freedom
## Multiple R-squared: 0.4818, Adjusted R-squared: 0.4777
## F-statistic: 118.1 on 1 and 127 DF, p-value: < 2.2e-16
# Scatter plot with regression line
ggplot(data = smoker_non_obese,aes(y=charges,x=age)) + geom_point() + labs(x = "Age", y = "Charges") + theme_bw() + geom_smooth(method='lm', formula = y~x)

# Residuals plot
qqnorm(rstandard(smoker_non_obese_lm), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(smoker_non_obese_lm))

# Squared residuals plot
plot(y=smoker_non_obese_lm$residuals^2,x=as.numeric(names(smoker_non_obese_lm$residuals^2)),ylab="Squared Residuals",xlab="Index")

# Histogram of residuals
hist(smoker_non_obese_lm$residuals,main = "Histogram of Residuals",xlab="",prob=T)

# 4) Non-smoker and non-obese category
non_smoker_non_obese <- data %>% filter(smoker=="no"&bmi_category=="Non-obese")
# Regression
non_smoker_non_obese_lm <- lm(charges~age,data = non_smoker_non_obese)
summary(non_smoker_non_obese_lm)
##
## Call:
## lm(formula = charges ~ age, data = non_smoker_non_obese)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3103.9 -1873.1 -1303.2 -660.2 22651.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2118.37 609.43 -3.476 0.000553 ***
## age 265.95 15.12 17.591 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4594 on 500 degrees of freedom
## Multiple R-squared: 0.3823, Adjusted R-squared: 0.3811
## F-statistic: 309.4 on 1 and 500 DF, p-value: < 2.2e-16
# Scatter plot with regression line
ggplot(data = non_smoker_non_obese,aes(y=charges,x=age)) + geom_point() + labs(x = "Age", y = "Charges") + theme_bw() + geom_smooth(method='lm', formula = y~x)

# Residuals plot
qqnorm(rstandard(non_smoker_non_obese_lm), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(non_smoker_non_obese_lm))

# Squared residuals plot
plot(y=non_smoker_non_obese_lm$residuals^2,x=as.numeric(names(non_smoker_non_obese_lm$residuals^2)),ylab="Squared Residuals",xlab="Index")

# Histogram of residuals
hist(non_smoker_non_obese_lm$residuals,main = "Histogram of Residuals",xlab="",prob=T)
