3) Discrete data analysis
3.1) Univariate frequency/proportion distribution, bar plot and pie chart
# Spoken Tutorials:
# 9. Indexing and Slicing Data Frames
# 15. Plotting Histograms and Pie Chart
# 16. Plotting Bar Charts and Scatter Plot
# Color source: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
# 3.1.1) Gender
# Univariate frequency/proportion distribution
prop.table(table(data$sex))
##
## female male
## 0.4947683 0.5052317
# Bar plot
barplot(height = prop.table(table(data$sex)), col = c("green","blue"), main = "Gender", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(data$sex)), labels = names(prop.table(table(data$sex))), col = c("green","blue"), radius = 1, main = "Gender")

# 3.1.2) Children
# Univariate frequency/proportion distribution
prop.table(table(data$children))
##
## 0 1 2 3 4 5
## 0.42899851 0.24215247 0.17937220 0.11733931 0.01868460 0.01345291
# Bar plot
barplot(height = prop.table(table(data$children)), col = c("darkolivegreen1","green","limegreen","lawngreen","darkseagreen4","forestgreen"), main = "Children", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(data$children)), labels = names(prop.table(table(data$children))), col = c("darkolivegreen1","green","limegreen","lawngreen","darkseagreen4","forestgreen"), radius = 1, main = "Children")

# 3.1.3) Smoker
# Univariate frequency/proportion distribution
prop.table(table(data$smoker))
##
## no yes
## 0.7952167 0.2047833
# Bar plot
barplot(height = prop.table(table(data$smoker)), col = c("lightcyan","grey"), main = "Smoker", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(data$smoker)), labels = names(prop.table(table(data$smoker))), col = c("lightcyan","grey"), radius = 1, main = "Smoker")

# 3.1.4) Region
# Univariate frequency/proportion distribution
prop.table(table(data$region))
##
## northeast northwest southeast southwest
## 0.2421525 0.2428999 0.2720478 0.2428999
# Bar plot
barplot(height = prop.table(table(data$region)), col = c("blue","slateblue1","steelblue2"), main = "Region", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(data$region)), labels = names(prop.table(table(data$region))), col = c("blue","slateblue1","steelblue2","darkviolet"), radius = 1, main = "Region")

3.2) Bivariate distribution and grouped bar plot
# Spoken Tutorials:
# 23. Functions in R
# Function for cross tabulation (proportion)
cross_tab_proportion <- function(x,y)
{
temp_table <- prop.table(table(x,y))
temp_col <- colSums(temp_table)
temp_row <- rowSums(temp_table)
temp_sum <- sum(temp_table)
temp <- as.data.frame(rbind(cbind(temp_table,temp_row),c(temp_col,temp_sum)))
rownames(temp)[nrow(temp)] <- "total"
colnames(temp)[ncol(temp)] <- "total"
return(temp)
}
# 3.2.1) Gender and smoker
# Cross tabulation
cross_tab_proportion(data$sex,data$smoker)
## no yes total
## female 0.4088191 0.08594918 0.4947683
## male 0.3863976 0.11883408 0.5052317
## total 0.7952167 0.20478326 1.0000000
# Grouped bar plot
table_data <- prop.table(table(data$sex,data$smoker))
barplot(table_data, main = "Gender", xlab = "", ylab = "Proportion",
col = c("blue", "red", "blue", "red"), beside = TRUE)
legend("topright", rownames(table_data), fill = c("blue", "red", "blue", "red"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

# 3.2.2) Children and smoker
# Cross tabulation
cross_tab_proportion(data$children,data$smoker)
## no yes total
## 0 0.34304933 0.0859491779 0.42899851
## 1 0.19656203 0.0455904335 0.24215247
## 2 0.13826607 0.0411061286 0.17937220
## 3 0.08819133 0.0291479821 0.11733931
## 4 0.01644245 0.0022421525 0.01868460
## 5 0.01270553 0.0007473842 0.01345291
## total 0.79521674 0.2047832586 1.00000000
# Grouped bar plot
table_data <- prop.table(table(data$children,data$smoker))
barplot(table_data, main = "Children", xlab = "", ylab = "Proportion",
col = c("dodgerblue","blue","slateblue1","steelblue2","darkviolet","deepskyblue4"), beside = TRUE)
legend("topright", rownames(table_data), fill = c("dodgerblue","blue","slateblue1","steelblue2","darkviolet","deepskyblue4"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

# 3.2.3) Region and smoker
# Cross tabulation
cross_tab_proportion(data$region,data$smoker)
## no yes total
## northeast 0.1920777 0.05007474 0.2421525
## northwest 0.1995516 0.04334828 0.2428999
## southeast 0.2040359 0.06801196 0.2720478
## southwest 0.1995516 0.04334828 0.2428999
## total 0.7952167 0.20478326 1.0000000
# Grouped bar plot
table_data <- prop.table(table(data$region,data$smoker))
barplot(table_data, main = "Region", xlab = "", ylab = "Proportion",
col = c("darkorange4","peru","tan1","sandybrown"), beside = TRUE)
legend("topright", rownames(table_data), fill = c("darkorange4","peru","tan1","sandybrown"), xpd=TRUE, inset=c(0, -.3), ncol = 2, bty = "n")

3.3) Stacked bar plot
# Spoken Tutorials:
# 19. Data Manipulation using dplyr Package
# 20. More Functions in dplyr Package
# 21. Pipe Operator
# 3.3.1) Gender and smoker
male <- data %>% filter(sex=="male")
female <- data %>% filter(sex=="female")
stack_data <- prop.table(table(data$sex,data$smoker))
rownames(stack_data) <- c("male","female")
barplot(stack_data, col=c("red","green"), main = "Gender", xlab = "", ylab = "Count")
legend("topright", rownames(stack_data), cex = .625, fill = c("red","green"), xpd=TRUE, inset=c(0, -.15), horiz = T)

3.4) Bivariate distribution (count) and Chi-squared test
# Function for cross tabulation (count)
cross_tab_count <- function(x,y)
{
temp_table <- table(x,y)
temp_col <- colSums(temp_table)
temp_row <- rowSums(temp_table)
temp_sum <- sum(temp_table)
temp <- as.data.frame(rbind(cbind(temp_table,temp_row),c(temp_col,temp_sum)))
rownames(temp)[nrow(temp)] <- "total"
colnames(temp)[ncol(temp)] <- "total"
return(temp)
}
# 3.4.1) Gender and smoker
# Cross tabulation
cross_tab_count(data$sex,data$smoker)
## no yes total
## female 547 115 662
## male 517 159 676
## total 1064 274 1338
# Chi-squared test
chisq.test(data$sex,data$smoker,correct = F)
##
## Pearson's Chi-squared test
##
## data: data$sex and data$smoker
## X-squared = 7.7659, df = 1, p-value = 0.005324
# 3.4.2) Children and smoker
# Cross tabulation
cross_tab_count(data$children,data$smoker)
## no yes total
## 0 459 115 574
## 1 263 61 324
## 2 185 55 240
## 3 118 39 157
## 4 22 3 25
## 5 17 1 18
## total 1064 274 1338
# Chi-squared test
chisq.test(data$children,data$smoker)
##
## Pearson's Chi-squared test
##
## data: data$children and data$smoker
## X-squared = 6.8877, df = 5, p-value = 0.2291
# 3.4.3) Region and smoker
# Cross tabulation
cross_tab_count(data$region,data$smoker)
## no yes total
## northeast 257 67 324
## northwest 267 58 325
## southeast 273 91 364
## southwest 267 58 325
## total 1064 274 1338
# Chi-squared test
chisq.test(data$region,data$smoker)
##
## Pearson's Chi-squared test
##
## data: data$region and data$smoker
## X-squared = 7.3435, df = 3, p-value = 0.06172
4) Continuous data analysis
4.1) Basic statistics
# Mean, standard deviation, minimum, maximum, range, quartiles, IQR, skewness and kurtosis
statistics <- function(x)
{
mean <- mean(x)
standard_deviation <- sd(x)
min <- min(x)
max <- max(x)
range <- max - min
first_quartile <- quantile(x)[2]
third_quartile <- quantile(x)[4]
IQR <- third_quartile - first_quartile
skewness <- mean((x-mean(x))^3)/(sd(x)^3)
kurtosis <- (mean((x-mean(x))^4))/sd(x)^4
return(cat("\t","Mean: ",mean,"\n\t","Standard deviation: ",standard_deviation,"\n\t","Minimum value: ",min,"\n\t","Maximum value: ",max,"\n\t","Range: ",range,"\n\t","First Quartile: ",first_quartile,"\n\t","Third Quartile: ",third_quartile,"\n\t","Interquartile Range: ",IQR,"\n\t","Skewness: ",skewness,"\n\t","Kurtosis: ",kurtosis))
}
# 4.1.1) Age
statistics(data$age)
## Mean: 39.20703
## Standard deviation: 14.04996
## Minimum value: 18
## Maximum value: 64
## Range: 46
## First Quartile: 27
## Third Quartile: 51
## Interquartile Range: 24
## Skewness: 0.05554775
## Kurtosis: 1.752457
# 4.1.2) BMI
statistics(data$bmi)
## Mean: 30.6634
## Standard deviation: 6.098187
## Minimum value: 15.96
## Maximum value: 53.13
## Range: 37.17
## First Quartile: 26.29625
## Third Quartile: 34.69375
## Interquartile Range: 8.3975
## Skewness: 0.2834106
## Kurtosis: 2.940576
# 4.1.3) Charges
statistics(data$charges)
## Mean: 13270.42
## Standard deviation: 12110.01
## Minimum value: 1121.874
## Maximum value: 63770.43
## Range: 62648.55
## First Quartile: 4740.287
## Third Quartile: 16639.91
## Interquartile Range: 11899.63
## Skewness: 1.512483
## Kurtosis: 4.588954
5) Mixed data analysis
5.1) Data classification
# Spoken Tutorials:
# 22. Conditional Statements
# 5.1.1) BMI
# Source: https://www.cdc.gov/healthyweight/assessing/bmi/adult_bmi/index.html
bmi_selection <- function(x)
{
if(x < 30){return("Non-obese")}else
if(x >= 30){return("Obese")}
}
bmi_category <- NULL
for(i in 1:nrow(data))
{
bmi_category <- append(bmi_category,bmi_selection(data$bmi[i]))
}
data <- cbind(data,bmi_category)
head(data)
## age sex bmi children smoker region charges bmi_category
## 1 19 female 27.900 0 yes southwest 16884.924 Non-obese
## 2 18 male 33.770 1 no southeast 1725.552 Obese
## 3 28 male 33.000 3 no southeast 4449.462 Obese
## 4 33 male 22.705 0 no northwest 21984.471 Non-obese
## 5 32 male 28.880 0 no northwest 3866.855 Non-obese
## 6 31 female 25.740 0 no southeast 3756.622 Non-obese
# 5.1.2) Age
# Source: https://www.census.gov/prod/cen2010/briefs/c2010br-03.pdf
age_selection <- function(x)
{
if(x < 18){return("Under 18 years")}else
if(x < 45){return("18 to 44 years")}else
if(x < 65){return("45 to 64 years")}else
if(x > 65){return("65 years and over")}
}
age_category <- NULL
for(i in 1:nrow(data))
{
age_category <- append(age_category,age_selection(data$age[i]))
}
data <- cbind(data,age_category)
head(data)
## age sex bmi children smoker region charges bmi_category
## 1 19 female 27.900 0 yes southwest 16884.924 Non-obese
## 2 18 male 33.770 1 no southeast 1725.552 Obese
## 3 28 male 33.000 3 no southeast 4449.462 Obese
## 4 33 male 22.705 0 no northwest 21984.471 Non-obese
## 5 32 male 28.880 0 no northwest 3866.855 Non-obese
## 6 31 female 25.740 0 no southeast 3756.622 Non-obese
## age_category
## 1 18 to 44 years
## 2 18 to 44 years
## 3 18 to 44 years
## 4 18 to 44 years
## 5 18 to 44 years
## 6 18 to 44 years
5.2) Pivot tables
# 5.2.1) Table of age categories and gender containing mean of charges
pivot(data,mean,charges,age_category,sex) # "age_category" is significant.
Table: mean of charges
sex female male
age_category
--------------- --------- ---------
18 to 44 years 10087.52 11451.60
45 to 64 years 16241.54 17915.27
# 5.2.2) Table of smoker and gender containing mean of charges
pivot(data,mean,charges,smoker,sex) # "smoker" is significant.
Table: mean of charges
sex female male
smoker
------- -------- ---------
no 8762.3 8087.20
yes 30679.0 33042.01
# 5.2.3) Table of smoker and age categories containing mean of charges
pivot(data,mean,charges,smoker,age_category) # Both "smoker" and "age_category" are significant.
Table: mean of charges
age_category 18 to 44 ye45 to 64 years
smoker
------------- --------- ---------
no 5621.98 12548.54
yes 29222.85 37209.48
# 5.2.4) Table of smoker and region containing mean of charges
pivot(data,mean,charges,smoker,region) # "smoker" is significant.
Table: mean of charges
region northeast northwest southeast southwest
smoker
------- --------- --------- --------- ---------
no 9165.53 8556.46 8032.22 8019.28
yes 29673.54 30192.00 34845.00 32269.06
# 5.2.5) Table of smoker and children containing mean of charges
pivot(data,mean,charges,smoker,children) # "smoker" is significant.
Table: mean of charges
children 0 1 2 3 4 5
smoker
--------- --------- --------- --------- --------- --------- ---------
no 7611.79 8303.11 9493.09 9614.52 12121.34 8183.85
yes 31341.36 31822.65 33844.24 32724.92 26532.28 19023.26
# 5.2.6) Table of smoker and bmi categories containing mean of charges
pivot(data,mean,charges,smoker,bmi_category) # "smoker" is significant but "bmi_category" is significant for smokers.
Table: mean of charges
bmi_category Non-obese Obese
smoker
------------- --------- ---------
no 7977.03 8842.69
yes 21363.22 41557.99
# 5.2.7) Table of age categories and bmi categories containing mean of charges
pivot(data,mean,charges,age_category,bmi_category) # Both "age_category" and "bmi_category" are significant.
Table: mean of charges
bmi_category Non-obese Obese
age_category
--------------- -------- ---------
18 to 44 years 8484.8 13161.52
45 to 64 years 14877.6 18631.77
# Significant variables
# 1) age_category
# 2) smoker
# 3) bmi_category