Loading libraries

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(readxl))
suppressPackageStartupMessages(library(lessR))

Reading data

# Source: https://archive.ics.uci.edu/ml/datasets/heart+disease
heart <- read_xlsx("heart.xlsx")
heart$age <- as.numeric(heart$age)
heart$trestbps <- as.numeric(heart$trestbps)
heart$chol <- as.numeric(heart$chol)
heart$thalach <- as.numeric(heart$thalach)
heart$oldpeak <- as.numeric(heart$oldpeak)
heart$ca <- as.numeric(heart$ca)

Discrete Variables

1) Univariate probablity mass function and visualization using bar plot & pie chart

1.1) P.M.F. of the variable “sex” (gender) and it’s visualization

# Spoken Tutorials:
# 15. Plotting Histograms and Pie Chart 
# 16. Plotting Bar Charts and Scatter Plot

prop.table(table(heart$sex))
## 
##       fem      male 
## 0.3209459 0.6790541
# Bar plot
barplot(height = prop.table(table(heart$sex)), col = c("green","blue"), main = "Gender", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(heart$sex)), labels = names(prop.table(table(heart$sex))), col = c("green","blue"), radius = 1, main = "Gender")

1.2) P.M.F. of the variable “cp” (chest pain) and it’s visualization

prop.table(table(heart$cp))
## 
##    abnang    angina    asympt    notang 
## 0.1655405 0.0777027 0.4763514 0.2804054
# Bar plot
barplot(height = prop.table(table(heart$cp)), col = c("red","indianred1","violetred2","orangered3"), main = "Chest pain type", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(heart$cp)), labels = names(prop.table(table(heart$cp))), col = c("red","indianred1","violetred2","orangered3"), radius = 1, main = "Chest pain type")

1.3) P.M.F. of the variable “restecg” (resting ECG) and it’s visualization

prop.table(table(heart$restecg))
## 
##        abn        hyp       norm 
## 0.01351351 0.48986486 0.49662162
# Bar plot
barplot(height = prop.table(table(heart$restecg)), col = c("blue","slateblue1","steelblue2"), main = "Resting electrocardiographic results", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(heart$restecg)), labels = names(prop.table(table(heart$restecg))), col = c("blue","slateblue1","steelblue2"), radius = 1, main = "Resting electrocardiographic results")

1.4) P.M.F. of the variable “slope” (slope of the peak exercise ST segment) and it’s visualization

prop.table(table(heart$slope))
## 
##       down       flat         up 
## 0.07094595 0.46283784 0.46621622
# Bar plot
barplot(height = prop.table(table(heart$slope)), col = c("green","limegreen","lawngreen"), main = "Slope of the peak exercise ST segment", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(heart$slope)), labels = names(prop.table(table(heart$slope))), col = c("green","limegreen","lawngreen"), radius = 1, main = "Slope of the peak exercise ST segment")

1.5) P.M.F. of the variable “thal” (thalasemia blood disorder) and it’s visualization

prop.table(table(heart$thal))
## 
##        fix       norm        rev 
## 0.06081081 0.55067568 0.38851351
# Bar plot
barplot(height = prop.table(table(heart$thal)), col = c("peru","tan1","sandybrown"), main = "Thal", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(heart$thal)), labels = names(prop.table(table(heart$thal))), col = c("peru","tan1","sandybrown"), radius = 1, main = "Thal")

1.6) P.M.F. of variable “ca” (number of major vessels) and it’s visualization

prop.table(table(heart$ca))
## 
##          0          1          2          3 
## 0.58445946 0.21959459 0.12837838 0.06756757
# Bar plot
barplot(height = prop.table(table(heart$ca)), col = c("grey","lightcyan","ivory","honeydew"), main = "Proportion of major vessels", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(heart$ca)), labels = names(prop.table(table(heart$ca))), col = c("grey","lightcyan","ivory","honeydew"), radius = 1, main = "Proportion of major vessels")

2) Bivariate joint probability mass function and visualization using grouped bar plot

# Spoken Tutorials:
# 23. Functions in R

# Function for cross tabulation (proportion)
cross_tab_proportion <- function(x,y)
{
  temp_table <- prop.table(table(x,y))
  temp_col <- colSums(temp_table)
  temp_row <- rowSums(temp_table)
  temp_sum <- sum(temp_table)
  temp <- as.data.frame(rbind(cbind(temp_table,temp_row),c(temp_col,temp_sum)))
  rownames(temp)[nrow(temp)] <- "total"
  colnames(temp)[ncol(temp)] <- "total"
  return(temp)
}

2.1) Bivariate joint p.m.f of gender and target along with grouped bar plot

cross_tab_proportion(heart$sex,heart$target)
##         healthy       sick     total
## fem   0.2398649 0.08108108 0.3209459
## male  0.3006757 0.37837838 0.6790541
## total 0.5405405 0.45945946 1.0000000
# Grouped bar plot
data <- prop.table(table(heart$target,heart$sex))
barplot(data, main = "Gender", xlab = "", ylab = "Proportion",
        col = c("blue", "red", "blue", "red"), beside = TRUE)
legend("topleft", rownames(data), fill = c("blue", "red"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

2.2) Bivariate joint p.m.f of chest pain and target along with grouped bar plot

cross_tab_proportion(heart$cp,heart$target)
##           healthy       sick     total
## abnang 0.13513514 0.03040541 0.1655405
## angina 0.05405405 0.02364865 0.0777027
## asympt 0.13175676 0.34459459 0.4763514
## notang 0.21959459 0.06081081 0.2804054
## total  0.54054054 0.45945946 1.0000000
# Grouped bar plot
data <- prop.table(table(heart$target,heart$cp))
barplot(data, main = "Chest pain type", xlab = "", ylab = "Proportion",
        col = c("blue", "red", "blue", "red"), beside = TRUE)
legend("topleft", rownames(data), fill = c("blue", "red"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

2.3) Bivariate joint p.m.f of resting ECG and target along with grouped bar plot

cross_tab_proportion(heart$restecg,heart$target)
##           healthy       sick      total
## abn   0.003378378 0.01013514 0.01351351
## hyp   0.226351351 0.26351351 0.48986486
## norm  0.310810811 0.18581081 0.49662162
## total 0.540540541 0.45945946 1.00000000
# Grouped bar plot
data <- prop.table(table(heart$target,heart$restecg))
barplot(data, main = "Resting electrocardiographic results", xlab = "", ylab = "Proportion",
        col = c("blue", "red", "blue", "red"), beside = TRUE)
legend("topleft", rownames(data), fill = c("blue", "red"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

2.4) Bivariate joint p.m.f of the slope of the peak exercise ST segment and target along with grouped bar plot

cross_tab_proportion(heart$slope,heart$target)
##          healthy       sick      total
## down  0.03040541 0.04054054 0.07094595
## flat  0.16216216 0.30067568 0.46283784
## up    0.34797297 0.11824324 0.46621622
## total 0.54054054 0.45945946 1.00000000
# Grouped bar plot
data <- prop.table(table(heart$target,heart$slope))
barplot(data, main = "Slope of the peak exercise ST segment", xlab = "", ylab = "Proportion",
        col = c("blue", "red", "blue", "red"), beside = TRUE)
legend("topleft", rownames(data), fill = c("blue", "red"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

2.5) Bivariate joint p.m.f of thalesemia blood disorder and target along with grouped bar plot

cross_tab_proportion(heart$thal,heart$target)
##          healthy       sick      total
## fix   0.02027027 0.04054054 0.06081081
## norm  0.42905405 0.12162162 0.55067568
## rev   0.09121622 0.29729730 0.38851351
## total 0.54054054 0.45945946 1.00000000
# Grouped bar plot
data <- prop.table(table(heart$target,heart$thal))
barplot(data, main = "Thal", xlab = "", ylab = "Proportion",
        col = c("blue", "red", "blue", "red"), beside = TRUE)
legend("topleft", rownames(data), fill = c("blue", "red"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

2.6) Bivariate joint p.m.f of the number of major vessels and target along with grouped bar plot

cross_tab_proportion(heart$ca,heart$target)
##          healthy       sick      total
## 0     0.43581081 0.14864865 0.58445946
## 1     0.07094595 0.14864865 0.21959459
## 2     0.02364865 0.10472973 0.12837838
## 3     0.01013514 0.05743243 0.06756757
## total 0.54054054 0.45945946 1.00000000
# Grouped bar plot
data <- prop.table(table(heart$target,heart$ca))
barplot(data, main = "Proportion of major vessels", xlab = "", ylab = "Proportion",
        col = c("blue", "red", "blue", "red"), beside = TRUE)
legend("topright", rownames(data), fill = c("blue", "red"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

3) \(\chi^2\) test: Testing association between “target” and other discrete variables

# Function for cross tabulation (count)
cross_tab_count <- function(x,y)
{
  temp_table <- table(x,y)
  temp_col <- colSums(temp_table)
  temp_row <- rowSums(temp_table)
  temp_sum <- sum(temp_table)
  temp <- as.data.frame(rbind(cbind(temp_table,temp_row),c(temp_col,temp_sum)))
  rownames(temp)[nrow(temp)] <- "total"
  colnames(temp)[ncol(temp)] <- "total"
  return(temp)
}

3.1) Testing association between gender and target

cross_tab_count(heart$sex,heart$target)
##       healthy sick total
## fem        71   24    95
## male       89  112   201
## total     160  136   296
# Chi-squared test
chisq.test(heart$sex,heart$target,correct = F)
## 
##  Pearson's Chi-squared test
## 
## data:  heart$sex and heart$target
## X-squared = 24.097, df = 1, p-value = 0.0000009161

3.2) Testing association between chest pain and target

cross_tab_count(heart$cp,heart$target)
##        healthy sick total
## abnang      40    9    49
## angina      16    7    23
## asympt      39  102   141
## notang      65   18    83
## total      160  136   296
# Chi-squared test
chisq.test(heart$cp,heart$target)
## 
##  Pearson's Chi-squared test
## 
## data:  heart$cp and heart$target
## X-squared = 76.454, df = 3, p-value < 0.00000000000000022

3.3) Testing association between resting ECG and target

cross_tab_count(heart$restecg,heart$target)
##       healthy sick total
## abn         1    3     4
## hyp        67   78   145
## norm       92   55   147
## total     160  136   296
# Chi-squared tests
chisq.test(heart$restecg,heart$target)
## 
##  Pearson's Chi-squared test
## 
## data:  heart$restecg and heart$target
## X-squared = 9.2624, df = 2, p-value = 0.009743

3.4) Testing association between the slope of the peak exercise ST segment and target

cross_tab_count(heart$slope,heart$target)
##       healthy sick total
## down        9   12    21
## flat       48   89   137
## up        103   35   138
## total     160  136   296
# Chi-squared test
chisq.test(heart$target,heart$slope)
## 
##  Pearson's Chi-squared test
## 
## data:  heart$target and heart$slope
## X-squared = 44.553, df = 2, p-value = 0.0000000002116

3.5) Testing association between thalsemia blood disorder and target

cross_tab_count(heart$thal,heart$target)
##       healthy sick total
## fix         6   12    18
## norm      127   36   163
## rev        27   88   115
## total     160  136   296
# Chi-squared test
chisq.test(heart$thal,heart$target)
## 
##  Pearson's Chi-squared test
## 
## data:  heart$thal and heart$target
## X-squared = 83.765, df = 2, p-value < 0.00000000000000022

3.6) Testing association between the number of major vessels and target

cross_tab_count(heart$ca,heart$target)
##       healthy sick total
## 0         129   44   173
## 1          21   44    65
## 2           7   31    38
## 3           3   17    20
## total     160  136   296
# Chi-squared test
chisq.test(heart$ca,heart$target)
## 
##  Pearson's Chi-squared test
## 
## data:  heart$ca and heart$target
## X-squared = 73.396, df = 3, p-value = 0.0000000000000007996

4) Testing of equality of two proportions

4.1) Testing whether the proportion of sick male and sick female are equal or not

# Spoken Tutorials:
# 19. Data Manipulation using dplyr Package
# 20. More Functions in dplyr Package
# 21. Pipe Operator

sick_female <- as.vector(table(heart %>% filter(target=="sick") %>% select(sex)))[1]
sick_male <- as.vector(table(heart %>% filter(target=="sick") %>% select(sex)))[2]
total_female <- length(unlist(heart %>% select(sex) %>% filter(sex=="fem")))
total_male <- length(unlist(heart %>% select(sex) %>% filter(sex=="male")))
proportion_test <- function(n1,N1,n2,N2)
{
  p1 <- n1/N1
  p2 <- n2/N2
  p  <- (n1+n2)/(N1+N2)
  se <- sqrt(p*(1-p)*((1/N1)+1/(N2)))
  result <- (p1-p2)/se
  return(cat("p1:",p1,"p2:",p2,"z-statistic:",result,"p-value:",c(2*(1-pnorm(abs(result))))))
}
proportion_test(sick_female,total_female,sick_male,total_male)
## p1: 0.2526316 p2: 0.5572139 z-statistic: -4.908864 p-value: 0.000000916056

4.2) Testing equality of two proportions by using a built-in R function

sick_female <- as.vector(table(heart %>% filter(target=="sick") %>% select(sex)))[1]
sick_male <- as.vector(table(heart %>% filter(target=="sick") %>% select(sex)))[2]
total_female <- length(unlist(heart %>% select(sex) %>% filter(sex=="fem")))
total_male <- length(unlist(heart %>% select(sex) %>% filter(sex=="male")))
prop.test(x=c(sick_female,sick_male), n=c(total_female,total_male),alternative = "two.sided", correct = FALSE)
## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  c(sick_female, sick_male) out of c(total_female, total_male)
## X-squared = 24.097, df = 1, p-value = 0.0000009161
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.4157135 -0.1934512
## sample estimates:
##    prop 1    prop 2 
## 0.2526316 0.5572139

Continuous variables

5) Basic statistics of continuous variables

# Mean, standard deviation, minimum, maximum, range, quartiles, IQR, skewness and kurtosis
statistics <- function(x)
{
  mean <- mean(x)
  standard_deviation <- sd(x)
  min <- min(x)
  max <- max(x)
  range <- max - min
  first_quartile <- quantile(x)[2]
  third_quartile <- quantile(x)[4]
  IQR <- third_quartile - first_quartile
  skewness <- mean((x-mean(x))^3)/(sd(x)^3)
  kurtosis <- (mean((x-mean(x))^4))/sd(x)^4
  
  return(cat("\t","Mean: ",mean,"\n\t","Standard deviation: ",standard_deviation,"\n\t","Minimum value: ",min,"\n\t","Maximum value: ",max,"\n\t","Range: ",range,"\n\t","First Quartile: ",first_quartile,"\n\t","Third Quartile: ",third_quartile,"\n\t","Interquartile Range: ",IQR,"\n\t","Skewness: ",skewness,"\n\t","Kurtosis: ",kurtosis))
}

5.1) Basic statistics of the variable “age”

statistics(heart$age)
##   Mean:  54.52365 
##   Standard deviation:  9.059471 
##   Minimum value:  29 
##   Maximum value:  77 
##   Range:  48 
##   First Quartile:  48 
##   Third Quartile:  61 
##   Interquartile Range:  13 
##   Skewness:  -0.212245 
##   Kurtosis:  2.4458

5.2) Basic statistics of the variable “chol” (Cholestrol)

statistics(heart$chol)
##   Mean:  247.1554 
##   Standard deviation:  51.97701 
##   Minimum value:  126 
##   Maximum value:  564 
##   Range:  438 
##   First Quartile:  211 
##   Third Quartile:  275.25 
##   Interquartile Range:  64.25 
##   Skewness:  1.118479 
##   Kurtosis:  7.347904

5.3) Basic statistics of the variable “thalach” (Maximum heart rate achieved)

statistics(heart$thalach)
##   Mean:  149.5608 
##   Standard deviation:  22.97079 
##   Minimum value:  71 
##   Maximum value:  202 
##   Range:  131 
##   First Quartile:  133 
##   Third Quartile:  166 
##   Interquartile Range:  33 
##   Skewness:  -0.526295 
##   Kurtosis:  2.900443

5.4) Basic statistics of the variable “oldpeak” (ST depression induced by exercise relative to rest)

statistics(heart$oldpeak)
##   Mean:  1.059122 
##   Standard deviation:  1.166474 
##   Minimum value:  0 
##   Maximum value:  6.2 
##   Range:  6.2 
##   First Quartile:  0 
##   Third Quartile:  1.65 
##   Interquartile Range:  1.65 
##   Skewness:  1.230977 
##   Kurtosis:  4.427748

6) Testing equality of mean of two population

6.1) Testing equality of the mean of “chol” among sick and healty group

# Test of equality of mean
chol_sick <- unlist(heart %>% filter(target=="sick") %>% select(chol))
chol_healthy <- unlist(heart %>% filter(target=="healthy") %>% select(chol))
mean_test <- function(x1,x2)
{
  x_bar_1 <- mean(x1)
  x_bar_2 <- mean(x2)
  n1=length(x1)
  n2=length(x2)
  sd_1 <- sd(x1)
  sd_2 <- sd(x2)
  se <- sqrt(((n1-1)*c(sd_1^2)+(n2-1)*c(sd_2^2))/(n1+n2-2))*sqrt(c(1/n1)+c(1/n2))
  t_stat <- (x_bar_1-x_bar_2)/se
  return(cat("x_bar 1:",x_bar_1,"x_bar 2:",x_bar_2,"t-statistic:",t_stat,"se:", se,"df:",c(n1+n2-2),"p-value:",2*c(1-pt(abs(t_stat),df=c(n1+n2-2)))))
}
mean_test(chol_sick,chol_healthy)
## x_bar 1: 251.4632 x_bar 2: 243.4938 t-statistic: 1.316258 se: 6.054652 df: 294 p-value: 0.1891126

6.2) Testing equality of mean using a built-in R function

chol_sick <- unlist(heart %>% filter(target=="sick") %>% select(chol))
chol_healthy <- unlist(heart %>% filter(target=="healthy") %>% select(chol))
A=t.test(chol_sick,chol_healthy,var.equal=T)
A
## 
##  Two Sample t-test
## 
## data:  chol_sick and chol_healthy
## t = 1.3163, df = 294, p-value = 0.1891
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.946467 19.885438
## sample estimates:
## mean of x mean of y 
##  251.4632  243.4938