1) Loading libraries

suppressPackageStartupMessages(library(tidyverse)) # For data manipulation and analysis
suppressPackageStartupMessages(library(lessR)) # For creating pivot tables

2) Loading data

# Spoken Tutorials:
# 5. Introduction to Data Frames in R

# Source: https://github.com/PacktPublishing/Machine-Learning-with-R-Second-Edition/blob/master/Chapter%2006/insurance.csv
data <- read.csv("insurance.csv")

3) Discrete data analysis

3.1) Univariate frequency/proportion distribution, bar plot and pie chart

# Spoken Tutorials:
# 9. Indexing and Slicing Data Frames
# 15. Plotting Histograms and Pie Chart 
# 16. Plotting Bar Charts and Scatter Plot

# Color source: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf

# 3.1.1) Gender
# Univariate frequency/proportion distribution
prop.table(table(data$sex))
## 
##    female      male 
## 0.4947683 0.5052317
# Bar plot
barplot(height = prop.table(table(data$sex)), col = c("green","blue"), main = "Gender", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(data$sex)), labels = names(prop.table(table(data$sex))), col = c("green","blue"), radius = 1, main = "Gender")

# 3.1.2) Children
# Univariate frequency/proportion distribution
prop.table(table(data$children))
## 
##          0          1          2          3          4          5 
## 0.42899851 0.24215247 0.17937220 0.11733931 0.01868460 0.01345291
# Bar plot
barplot(height = prop.table(table(data$children)), col = c("darkolivegreen1","green","limegreen","lawngreen","darkseagreen4","forestgreen"), main = "Children", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(data$children)), labels = names(prop.table(table(data$children))), col = c("darkolivegreen1","green","limegreen","lawngreen","darkseagreen4","forestgreen"), radius = 1, main = "Children")

# 3.1.3) Smoker
# Univariate frequency/proportion distribution
prop.table(table(data$smoker))
## 
##        no       yes 
## 0.7952167 0.2047833
# Bar plot
barplot(height = prop.table(table(data$smoker)), col = c("lightcyan","grey"), main = "Smoker", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(data$smoker)), labels = names(prop.table(table(data$smoker))), col = c("lightcyan","grey"), radius = 1, main = "Smoker")

# 3.1.4) Region
# Univariate frequency/proportion distribution
prop.table(table(data$region))
## 
## northeast northwest southeast southwest 
## 0.2421525 0.2428999 0.2720478 0.2428999
# Bar plot
barplot(height = prop.table(table(data$region)), col = c("blue","slateblue1","steelblue2"), main = "Region", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(data$region)), labels = names(prop.table(table(data$region))), col = c("blue","slateblue1","steelblue2","darkviolet"), radius = 1, main = "Region")

3.2) Bivariate distribution and grouped bar plot

# Spoken Tutorials:
# 23. Functions in R

# Function for cross tabulation (proportion)
cross_tab_proportion <- function(x,y)
{
  temp_table <- prop.table(table(x,y))
  temp_col <- colSums(temp_table)
  temp_row <- rowSums(temp_table)
  temp_sum <- sum(temp_table)
  temp <- as.data.frame(rbind(cbind(temp_table,temp_row),c(temp_col,temp_sum)))
  rownames(temp)[nrow(temp)] <- "total"
  colnames(temp)[ncol(temp)] <- "total"
  return(temp)
}

# 3.2.1) Gender and smoker
# Cross tabulation
cross_tab_proportion(data$sex,data$smoker)
##               no        yes     total
## female 0.4088191 0.08594918 0.4947683
## male   0.3863976 0.11883408 0.5052317
## total  0.7952167 0.20478326 1.0000000
# Grouped bar plot
table_data <- prop.table(table(data$sex,data$smoker))
barplot(table_data, main = "Gender", xlab = "", ylab = "Proportion",
        col = c("blue", "red", "blue", "red"), beside = TRUE)
legend("topright", rownames(table_data), fill = c("blue", "red", "blue", "red"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

# 3.2.2) Children and smoker
# Cross tabulation
cross_tab_proportion(data$children,data$smoker)
##               no          yes      total
## 0     0.34304933 0.0859491779 0.42899851
## 1     0.19656203 0.0455904335 0.24215247
## 2     0.13826607 0.0411061286 0.17937220
## 3     0.08819133 0.0291479821 0.11733931
## 4     0.01644245 0.0022421525 0.01868460
## 5     0.01270553 0.0007473842 0.01345291
## total 0.79521674 0.2047832586 1.00000000
# Grouped bar plot
table_data <- prop.table(table(data$children,data$smoker))
barplot(table_data, main = "Children", xlab = "", ylab = "Proportion",
        col = c("dodgerblue","blue","slateblue1","steelblue2","darkviolet","deepskyblue4"), beside = TRUE)
legend("topright", rownames(table_data), fill = c("dodgerblue","blue","slateblue1","steelblue2","darkviolet","deepskyblue4"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

# 3.2.3) Region and smoker
# Cross tabulation
cross_tab_proportion(data$region,data$smoker)
##                  no        yes     total
## northeast 0.1920777 0.05007474 0.2421525
## northwest 0.1995516 0.04334828 0.2428999
## southeast 0.2040359 0.06801196 0.2720478
## southwest 0.1995516 0.04334828 0.2428999
## total     0.7952167 0.20478326 1.0000000
# Grouped bar plot
table_data <- prop.table(table(data$region,data$smoker))
barplot(table_data, main = "Region", xlab = "", ylab = "Proportion",
        col = c("darkorange4","peru","tan1","sandybrown"), beside = TRUE)
legend("topright", rownames(table_data), fill = c("darkorange4","peru","tan1","sandybrown"), xpd=TRUE, inset=c(0, -.3), ncol = 2, bty = "n")

3.3) Stacked bar plot

# Spoken Tutorials:
# 19. Data Manipulation using dplyr Package
# 20. More Functions in dplyr Package
# 21. Pipe Operator

# 3.3.1) Gender and smoker
male <- data %>% filter(sex=="male")
female <- data %>% filter(sex=="female")
stack_data <- prop.table(table(data$sex,data$smoker))
rownames(stack_data) <- c("male","female")
barplot(stack_data, col=c("red","green"), main = "Gender", xlab = "", ylab = "Count")
legend("topright", rownames(stack_data), cex = .625, fill = c("red","green"), xpd=TRUE, inset=c(0, -.15), horiz = T)

3.4) Bivariate distribution (count) and Chi-squared test

# Function for cross tabulation (count)
cross_tab_count <- function(x,y)
{
  temp_table <- table(x,y)
  temp_col <- colSums(temp_table)
  temp_row <- rowSums(temp_table)
  temp_sum <- sum(temp_table)
  temp <- as.data.frame(rbind(cbind(temp_table,temp_row),c(temp_col,temp_sum)))
  rownames(temp)[nrow(temp)] <- "total"
  colnames(temp)[ncol(temp)] <- "total"
  return(temp)
}

# 3.4.1) Gender and smoker
# Cross tabulation
cross_tab_count(data$sex,data$smoker)
##          no yes total
## female  547 115   662
## male    517 159   676
## total  1064 274  1338
# Chi-squared test
chisq.test(data$sex,data$smoker,correct = F)
## 
##  Pearson's Chi-squared test
## 
## data:  data$sex and data$smoker
## X-squared = 7.7659, df = 1, p-value = 0.005324
# 3.4.2) Children and smoker
# Cross tabulation
cross_tab_count(data$children,data$smoker)
##         no yes total
## 0      459 115   574
## 1      263  61   324
## 2      185  55   240
## 3      118  39   157
## 4       22   3    25
## 5       17   1    18
## total 1064 274  1338
# Chi-squared test
chisq.test(data$children,data$smoker)
## 
##  Pearson's Chi-squared test
## 
## data:  data$children and data$smoker
## X-squared = 6.8877, df = 5, p-value = 0.2291
# 3.4.3) Region and smoker
# Cross tabulation
cross_tab_count(data$region,data$smoker)
##             no yes total
## northeast  257  67   324
## northwest  267  58   325
## southeast  273  91   364
## southwest  267  58   325
## total     1064 274  1338
# Chi-squared test
chisq.test(data$region,data$smoker)
## 
##  Pearson's Chi-squared test
## 
## data:  data$region and data$smoker
## X-squared = 7.3435, df = 3, p-value = 0.06172

4) Continuous data analysis

4.1) Basic statistics

# Mean, standard deviation, minimum, maximum, range, quartiles, IQR, skewness and kurtosis
statistics <- function(x)
{
  mean <- mean(x)
  standard_deviation <- sd(x)
  min <- min(x)
  max <- max(x)
  range <- max - min
  first_quartile <- quantile(x)[2]
  third_quartile <- quantile(x)[4]
  IQR <- third_quartile - first_quartile
  skewness <- mean((x-mean(x))^3)/(sd(x)^3)
  kurtosis <- (mean((x-mean(x))^4))/sd(x)^4
  
  return(cat("\t","Mean: ",mean,"\n\t","Standard deviation: ",standard_deviation,"\n\t","Minimum value: ",min,"\n\t","Maximum value: ",max,"\n\t","Range: ",range,"\n\t","First Quartile: ",first_quartile,"\n\t","Third Quartile: ",third_quartile,"\n\t","Interquartile Range: ",IQR,"\n\t","Skewness: ",skewness,"\n\t","Kurtosis: ",kurtosis))
}
# 4.1.1) Age
statistics(data$age)
##   Mean:  39.20703 
##   Standard deviation:  14.04996 
##   Minimum value:  18 
##   Maximum value:  64 
##   Range:  46 
##   First Quartile:  27 
##   Third Quartile:  51 
##   Interquartile Range:  24 
##   Skewness:  0.05554775 
##   Kurtosis:  1.752457
# 4.1.2) BMI
statistics(data$bmi)
##   Mean:  30.6634 
##   Standard deviation:  6.098187 
##   Minimum value:  15.96 
##   Maximum value:  53.13 
##   Range:  37.17 
##   First Quartile:  26.29625 
##   Third Quartile:  34.69375 
##   Interquartile Range:  8.3975 
##   Skewness:  0.2834106 
##   Kurtosis:  2.940576
# 4.1.3) Charges
statistics(data$charges)
##   Mean:  13270.42 
##   Standard deviation:  12110.01 
##   Minimum value:  1121.874 
##   Maximum value:  63770.43 
##   Range:  62648.55 
##   First Quartile:  4740.287 
##   Third Quartile:  16639.91 
##   Interquartile Range:  11899.63 
##   Skewness:  1.512483 
##   Kurtosis:  4.588954

5) Mixed data analysis

5.1) Data classification

# Spoken Tutorials:
# 22. Conditional Statements

# 5.1.1) BMI

# Source: https://www.cdc.gov/healthyweight/assessing/bmi/adult_bmi/index.html
bmi_selection <- function(x)
{
  if(x < 30){return("Non-obese")}else 
        if(x >= 30){return("Obese")}
}
bmi_category <- NULL
for(i in 1:nrow(data))
{
  bmi_category <- append(bmi_category,bmi_selection(data$bmi[i]))
}
data <- cbind(data,bmi_category)
head(data)
##   age    sex    bmi children smoker    region   charges bmi_category
## 1  19 female 27.900        0    yes southwest 16884.924    Non-obese
## 2  18   male 33.770        1     no southeast  1725.552        Obese
## 3  28   male 33.000        3     no southeast  4449.462        Obese
## 4  33   male 22.705        0     no northwest 21984.471    Non-obese
## 5  32   male 28.880        0     no northwest  3866.855    Non-obese
## 6  31 female 25.740        0     no southeast  3756.622    Non-obese
# 5.1.2) Age

# Source: https://www.census.gov/prod/cen2010/briefs/c2010br-03.pdf
age_selection <- function(x)
{
  if(x < 18){return("Under 18 years")}else
    if(x < 45){return("18 to 44 years")}else
      if(x < 65){return("45 to 64 years")}else
        if(x > 65){return("65 years and over")}
}
age_category <- NULL
for(i in 1:nrow(data))
{
  age_category <- append(age_category,age_selection(data$age[i]))
}
data <- cbind(data,age_category)
head(data)
##   age    sex    bmi children smoker    region   charges bmi_category
## 1  19 female 27.900        0    yes southwest 16884.924    Non-obese
## 2  18   male 33.770        1     no southeast  1725.552        Obese
## 3  28   male 33.000        3     no southeast  4449.462        Obese
## 4  33   male 22.705        0     no northwest 21984.471    Non-obese
## 5  32   male 28.880        0     no northwest  3866.855    Non-obese
## 6  31 female 25.740        0     no southeast  3756.622    Non-obese
##     age_category
## 1 18 to 44 years
## 2 18 to 44 years
## 3 18 to 44 years
## 4 18 to 44 years
## 5 18 to 44 years
## 6 18 to 44 years

5.2) Pivot tables

# 5.2.1) Table of age categories and gender containing mean of charges
pivot(data,mean,charges,age_category,sex) # "age_category" is significant.
Table: mean of charges 

 sex             female     male         
   age_category                      
---------------  ---------  ---------
 18 to 44 years   10087.52   11451.60
 45 to 64 years   16241.54   17915.27
# 5.2.2) Table of smoker and gender containing mean of charges
pivot(data,mean,charges,smoker,sex) # "smoker" is significant.
Table: mean of charges 

 sex     female    male         
 smoker                     
-------  --------  ---------
     no    8762.3    8087.20
    yes   30679.0   33042.01
# 5.2.3) Table of smoker and age categories containing mean of charges
pivot(data,mean,charges,smoker,age_category) # Both "smoker" and "age_category" are significant.
Table: mean of charges 

 age_category  18 to 44 ye45 to 64 years        
 smoker                            
-------------  ---------  ---------
           no    5621.98   12548.54
          yes   29222.85   37209.48
# 5.2.4) Table of smoker and region containing mean of charges
pivot(data,mean,charges,smoker,region) # "smoker" is significant.
Table: mean of charges 

 region  northeast  northwest  southeast  southwest       
 smoker                                            
-------  ---------  ---------  ---------  ---------
     no    9165.53    8556.46    8032.22    8019.28
    yes   29673.54   30192.00   34845.00   32269.06
# 5.2.5) Table of smoker and children containing mean of charges
pivot(data,mean,charges,smoker,children) # "smoker" is significant.
Table: mean of charges 

 children  0          1          2          3          4          5                 
 smoker                                                                    
---------  ---------  ---------  ---------  ---------  ---------  ---------
       no    7611.79    8303.11    9493.09    9614.52   12121.34    8183.85
      yes   31341.36   31822.65   33844.24   32724.92   26532.28   19023.26
# 5.2.6) Table of smoker and bmi categories containing mean of charges
pivot(data,mean,charges,smoker,bmi_category) # "smoker" is significant but "bmi_category" is significant for smokers.
Table: mean of charges 

 bmi_category  Non-obese  Obese                 
 smoker                            
-------------  ---------  ---------
           no    7977.03    8842.69
          yes   21363.22   41557.99
# 5.2.7) Table of age categories and bmi categories containing mean of charges
pivot(data,mean,charges,age_category,bmi_category) # Both "age_category" and "bmi_category" are significant.
Table: mean of charges 

 bmi_category    Non-obese Obese                 
   age_category                     
---------------  --------  ---------
 18 to 44 years    8484.8   13161.52
 45 to 64 years   14877.6   18631.77
# Significant variables
# 1) age_category
# 2) smoker
# 3) bmi_category

6) Saving modified data

write.csv(data,"updated_insurance.csv",row.names = F)