1) Loading libraries

suppressPackageStartupMessages(library(tidyverse)) # For data manipulation and analysis
suppressPackageStartupMessages(library(skimr)) # Data summary

2) Loading data

data <- suppressMessages(read_csv("updated_insurance.csv"))

3) Basic statistics

# Mean, standard deviation, minimum, maximum, range, quartiles, IQR, skewness and kurtosis
statistics <- function(x)
{
  mean <- mean(x)
  standard_deviation <- sd(x)
  min <- min(x)
  max <- max(x)
  range <- max - min
  first_quartile <- quantile(x)[2]
  third_quartile <- quantile(x)[4]
  IQR <- third_quartile - first_quartile
  skewness <- mean((x-mean(x))^3)/(sd(x)^3)
  kurtosis <- (mean((x-mean(x))^4))/sd(x)^4
  
  return(cat("\t","Mean: ",mean,"\n\t","Standard deviation: ",standard_deviation,"\n\t","Minimum value: ",min,"\n\t","Maximum value: ",max,"\n\t","Range: ",range,"\n\t","First Quartile: ",first_quartile,"\n\t","Third Quartile: ",third_quartile,"\n\t","Interquartile Range: ",IQR,"\n\t","Skewness: ",skewness,"\n\t","Kurtosis: ",kurtosis))
}
# Age
statistics(data$age)
##   Mean:  39.20703 
##   Standard deviation:  14.04996 
##   Minimum value:  18 
##   Maximum value:  64 
##   Range:  46 
##   First Quartile:  27 
##   Third Quartile:  51 
##   Interquartile Range:  24 
##   Skewness:  0.05554775 
##   Kurtosis:  1.752457
# BMI
statistics(data$bmi)
##   Mean:  30.6634 
##   Standard deviation:  6.098187 
##   Minimum value:  15.96 
##   Maximum value:  53.13 
##   Range:  37.17 
##   First Quartile:  26.29625 
##   Third Quartile:  34.69375 
##   Interquartile Range:  8.3975 
##   Skewness:  0.2834106 
##   Kurtosis:  2.940576
# Charges
statistics(data$charges)
##   Mean:  13270.42 
##   Standard deviation:  12110.01 
##   Minimum value:  1121.874 
##   Maximum value:  63770.43 
##   Range:  62648.55 
##   First Quartile:  4740.287 
##   Third Quartile:  16639.91 
##   Interquartile Range:  11899.63 
##   Skewness:  1.512483 
##   Kurtosis:  4.588954

4) Histogram

# Distribution of the "charges" variable
hist(data$charges,col="green",border="black",prob = TRUE,xlab = "",main = "Charges")

5) Data summary

skim(data)
Data summary
Name data
Number of rows 1338
Number of columns 9
_______________________
Column type frequency:
character 5
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
sex 0 1 4 6 0 2 0
smoker 0 1 2 3 0 2 0
region 0 1 9 9 0 4 0
bmi_category 0 1 5 9 0 2 0
age_category 0 1 14 14 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 39.21 14.05 18.00 27.00 39.00 51.00 64.00 ▇▅▅▆▆
bmi 0 1 30.66 6.10 15.96 26.30 30.40 34.69 53.13 ▂▇▇▂▁
children 0 1 1.09 1.21 0.00 0.00 1.00 2.00 5.00 ▇▂▂▁▁
charges 0 1 13270.42 12110.01 1121.87 4740.29 9382.03 16639.91 63770.43 ▇▂▁▁▁

6) Correlation

# Correlation between all continuous variables
cor(data[,c("age","bmi","charges")])
##               age       bmi   charges
## age     1.0000000 0.1092719 0.2990082
## bmi     0.1092719 1.0000000 0.1983410
## charges 0.2990082 0.1983410 1.0000000

7) Scatter plot

# Spoken Tutorials:
# 17. Introduction to ggplot2
# 18. Aesthetic Mapping in ggplot2
# 19. Data Manipulation using dplyr Package
# 20. More Functions in dplyr Package
# 21. Pipe Operator

# BMI and charges
# Scatter plot
ggplot(data = data,aes(y=charges,x=bmi)) + geom_point() + labs(x = "BMI", y = "Charges") + theme_bw()

# Age and charges
# Scatter plot
ggplot(data = data,aes(y=charges,x=age)) + geom_point() + labs(x = "Age", y = "Charges") + theme_bw()

# Significant variables
# 1) age_category
# 2) smoker
# 3) bmi_category

# Finding patterns using single discrete variable
# Finding patterns using "age_category" variable
ggplot(data = data,aes(y=charges,x=age)) + geom_point(aes(color=age_category)) + labs(x = "Age", y = "Charges") + theme_bw()

# Finding patterns using "smoker" variable
ggplot(data = data,aes(y=charges,x=age)) + geom_point(aes(color=smoker)) + labs(x = "Age", y = "Charges") + theme_bw()

# Finding patterns using "bmi_category" variable
ggplot(data = data,aes(y=charges,x=age)) + geom_point(aes(color=bmi_category)) + labs(x = "Age", y = "Charges") + theme_bw()

# Finding patterns using multiple discrete variables
# Finding patterns using "smoker" and "bmi_category" variables
ggplot(data = data %>% filter(bmi_category=="Obese"),aes(y=charges,x=age)) + geom_point(aes(color=smoker)) + labs(x = "Age", y = "Charges") + theme_bw()

ggplot(data = data %>% filter(bmi_category=="Non-obese"),aes(y=charges,x=age)) + geom_point(aes(color=smoker)) + labs(x = "Age", y = "Charges") + theme_bw()

# Four categories are discovered
# 1) Smoker and obese
# 2) Non-smoker and obese
# 3) Smoker and non-obese
# 4) Non-smoker and non-obese

8) Scatter plot and regression

# 1) Smoker and obese category
smoker_obese <- data %>% filter(smoker=="yes"&bmi_category=="Obese")
# Regression
smoker_obese_lm <- lm(charges~age,data = smoker_obese)
summary(smoker_obese_lm)
## 
## Call:
## lm(formula = charges ~ age, data = smoker_obese)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20222.5  -2204.8  -1071.4    998.7  19382.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30558.13    1093.04   27.96   <2e-16 ***
## age           281.15      26.25   10.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4508 on 143 degrees of freedom
## Multiple R-squared:  0.4452, Adjusted R-squared:  0.4413 
## F-statistic: 114.7 on 1 and 143 DF,  p-value: < 2.2e-16
# Scatter plot with regression line
ggplot(data = smoker_obese,aes(y=charges,x=age)) + geom_point() + labs(x = "Age", y = "Charges") + theme_bw() + geom_smooth(method='lm', formula = y~x)

# Residuals plot
qqnorm(rstandard(smoker_obese_lm), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(smoker_obese_lm))

# Squared residuals plot
plot(y=smoker_obese_lm$residuals^2,x=as.numeric(names(smoker_obese_lm$residuals^2)),ylab="Squared Residuals",xlab="Index")

# Histogram of residuals
hist(smoker_obese_lm$residuals,main = "Histogram of Residuals",xlab="",prob=T)

# 2) Non-smoker and obese category
non_smoker_obese <- data %>% filter(smoker=="no"&bmi_category=="Obese")
# Regression
non_smoker_obese_lm <- lm(charges~age,data = non_smoker_obese)
summary(non_smoker_obese_lm)
## 
## Call:
## lm(formula = charges ~ age, data = non_smoker_obese)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3234.2 -2017.5 -1435.5  -650.9 24405.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2029.05     598.59   -3.39 0.000749 ***
## age           267.39      13.88   19.27  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4738 on 560 degrees of freedom
## Multiple R-squared:  0.3987, Adjusted R-squared:  0.3976 
## F-statistic: 371.3 on 1 and 560 DF,  p-value: < 2.2e-16
# Scatter plot with regression line
ggplot(data = non_smoker_obese,aes(y=charges,x=age)) + geom_point() + labs(x = "Age", y = "Charges") + theme_bw() + geom_smooth(method='lm', formula = y~x)

# Residuals plot
qqnorm(rstandard(non_smoker_obese_lm), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(smoker_obese_lm))

# Squared residuals plot
plot(y=non_smoker_obese_lm$residuals^2,x=as.numeric(names(non_smoker_obese_lm$residuals^2)),ylab="Squared Residuals",xlab="Index")

# Histogram of residuals
hist(non_smoker_obese_lm$residuals,main = "Histogram of Residuals",xlab="",prob=T)

# 3) Smoker and non-obese category
smoker_non_obese <- data %>% filter(smoker=="yes"&bmi_category=="Non-obese")
# Regression
smoker_non_obese_lm <- lm(charges~age,data = smoker_non_obese)
summary(smoker_non_obese_lm)
## 
## Call:
## lm(formula = charges ~ age, data = smoker_non_obese)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5587.0 -1960.8  -445.8   782.3 17388.8 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11503.36     962.92   11.95   <2e-16 ***
## age           260.64      23.99   10.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3662 on 127 degrees of freedom
## Multiple R-squared:  0.4818, Adjusted R-squared:  0.4777 
## F-statistic: 118.1 on 1 and 127 DF,  p-value: < 2.2e-16
# Scatter plot with regression line
ggplot(data = smoker_non_obese,aes(y=charges,x=age)) + geom_point() + labs(x = "Age", y = "Charges") + theme_bw() + geom_smooth(method='lm', formula = y~x)

# Residuals plot
qqnorm(rstandard(smoker_non_obese_lm), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(smoker_non_obese_lm))

# Squared residuals plot
plot(y=smoker_non_obese_lm$residuals^2,x=as.numeric(names(smoker_non_obese_lm$residuals^2)),ylab="Squared Residuals",xlab="Index")

# Histogram of residuals
hist(smoker_non_obese_lm$residuals,main = "Histogram of Residuals",xlab="",prob=T)

# 4) Non-smoker and non-obese category
non_smoker_non_obese <- data %>% filter(smoker=="no"&bmi_category=="Non-obese")
# Regression
non_smoker_non_obese_lm <- lm(charges~age,data = non_smoker_non_obese)
summary(non_smoker_non_obese_lm)
## 
## Call:
## lm(formula = charges ~ age, data = non_smoker_non_obese)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3103.9 -1873.1 -1303.2  -660.2 22651.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2118.37     609.43  -3.476 0.000553 ***
## age           265.95      15.12  17.591  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4594 on 500 degrees of freedom
## Multiple R-squared:  0.3823, Adjusted R-squared:  0.3811 
## F-statistic: 309.4 on 1 and 500 DF,  p-value: < 2.2e-16
# Scatter plot with regression line
ggplot(data = non_smoker_non_obese,aes(y=charges,x=age)) + geom_point() + labs(x = "Age", y = "Charges") + theme_bw() + geom_smooth(method='lm', formula = y~x)

# Residuals plot
qqnorm(rstandard(non_smoker_non_obese_lm), ylab="Standardized Residuals",xlab="Normal Scores")
qqline(rstandard(non_smoker_non_obese_lm))

# Squared residuals plot
plot(y=non_smoker_non_obese_lm$residuals^2,x=as.numeric(names(non_smoker_non_obese_lm$residuals^2)),ylab="Squared Residuals",xlab="Index")

# Histogram of residuals
hist(non_smoker_non_obese_lm$residuals,main = "Histogram of Residuals",xlab="",prob=T)