1) Univariate probablity mass function and visualization using bar plot & pie chart
1.1) P.M.F. of the variable “sex” (gender) and it’s visualization
# Spoken Tutorials:
# 15. Plotting Histograms and Pie Chart
# 16. Plotting Bar Charts and Scatter Plot
prop.table(table(heart$sex))
##
## fem male
## 0.3209459 0.6790541
# Bar plot
barplot(height = prop.table(table(heart$sex)), col = c("green","blue"), main = "Gender", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(heart$sex)), labels = names(prop.table(table(heart$sex))), col = c("green","blue"), radius = 1, main = "Gender")

1.2) P.M.F. of the variable “cp” (chest pain) and it’s visualization
prop.table(table(heart$cp))
##
## abnang angina asympt notang
## 0.1655405 0.0777027 0.4763514 0.2804054
# Bar plot
barplot(height = prop.table(table(heart$cp)), col = c("red","indianred1","violetred2","orangered3"), main = "Chest pain type", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(heart$cp)), labels = names(prop.table(table(heart$cp))), col = c("red","indianred1","violetred2","orangered3"), radius = 1, main = "Chest pain type")

1.3) P.M.F. of the variable “restecg” (resting ECG) and it’s visualization
prop.table(table(heart$restecg))
##
## abn hyp norm
## 0.01351351 0.48986486 0.49662162
# Bar plot
barplot(height = prop.table(table(heart$restecg)), col = c("blue","slateblue1","steelblue2"), main = "Resting electrocardiographic results", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(heart$restecg)), labels = names(prop.table(table(heart$restecg))), col = c("blue","slateblue1","steelblue2"), radius = 1, main = "Resting electrocardiographic results")

1.4) P.M.F. of the variable “slope” (slope of the peak exercise ST segment) and it’s visualization
prop.table(table(heart$slope))
##
## down flat up
## 0.07094595 0.46283784 0.46621622
# Bar plot
barplot(height = prop.table(table(heart$slope)), col = c("green","limegreen","lawngreen"), main = "Slope of the peak exercise ST segment", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(heart$slope)), labels = names(prop.table(table(heart$slope))), col = c("green","limegreen","lawngreen"), radius = 1, main = "Slope of the peak exercise ST segment")

1.5) P.M.F. of the variable “thal” (thalasemia blood disorder) and it’s visualization
prop.table(table(heart$thal))
##
## fix norm rev
## 0.06081081 0.55067568 0.38851351
# Bar plot
barplot(height = prop.table(table(heart$thal)), col = c("peru","tan1","sandybrown"), main = "Thal", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(heart$thal)), labels = names(prop.table(table(heart$thal))), col = c("peru","tan1","sandybrown"), radius = 1, main = "Thal")

1.6) P.M.F. of variable “ca” (number of major vessels) and it’s visualization
prop.table(table(heart$ca))
##
## 0 1 2 3
## 0.58445946 0.21959459 0.12837838 0.06756757
# Bar plot
barplot(height = prop.table(table(heart$ca)), col = c("grey","lightcyan","ivory","honeydew"), main = "Proportion of major vessels", xlab = "", ylab = "Proportion")

# Pie chart
pie(x = prop.table(table(heart$ca)), labels = names(prop.table(table(heart$ca))), col = c("grey","lightcyan","ivory","honeydew"), radius = 1, main = "Proportion of major vessels")

2) Bivariate joint probability mass function and visualization using grouped bar plot
# Spoken Tutorials:
# 23. Functions in R
# Function for cross tabulation (proportion)
cross_tab_proportion <- function(x,y)
{
temp_table <- prop.table(table(x,y))
temp_col <- colSums(temp_table)
temp_row <- rowSums(temp_table)
temp_sum <- sum(temp_table)
temp <- as.data.frame(rbind(cbind(temp_table,temp_row),c(temp_col,temp_sum)))
rownames(temp)[nrow(temp)] <- "total"
colnames(temp)[ncol(temp)] <- "total"
return(temp)
}
2.1) Bivariate joint p.m.f of gender and target along with grouped bar plot
cross_tab_proportion(heart$sex,heart$target)
## healthy sick total
## fem 0.2398649 0.08108108 0.3209459
## male 0.3006757 0.37837838 0.6790541
## total 0.5405405 0.45945946 1.0000000
# Grouped bar plot
data <- prop.table(table(heart$target,heart$sex))
barplot(data, main = "Gender", xlab = "", ylab = "Proportion",
col = c("blue", "red", "blue", "red"), beside = TRUE)
legend("topleft", rownames(data), fill = c("blue", "red"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

2.2) Bivariate joint p.m.f of chest pain and target along with grouped bar plot
cross_tab_proportion(heart$cp,heart$target)
## healthy sick total
## abnang 0.13513514 0.03040541 0.1655405
## angina 0.05405405 0.02364865 0.0777027
## asympt 0.13175676 0.34459459 0.4763514
## notang 0.21959459 0.06081081 0.2804054
## total 0.54054054 0.45945946 1.0000000
# Grouped bar plot
data <- prop.table(table(heart$target,heart$cp))
barplot(data, main = "Chest pain type", xlab = "", ylab = "Proportion",
col = c("blue", "red", "blue", "red"), beside = TRUE)
legend("topleft", rownames(data), fill = c("blue", "red"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

2.3) Bivariate joint p.m.f of resting ECG and target along with grouped bar plot
cross_tab_proportion(heart$restecg,heart$target)
## healthy sick total
## abn 0.003378378 0.01013514 0.01351351
## hyp 0.226351351 0.26351351 0.48986486
## norm 0.310810811 0.18581081 0.49662162
## total 0.540540541 0.45945946 1.00000000
# Grouped bar plot
data <- prop.table(table(heart$target,heart$restecg))
barplot(data, main = "Resting electrocardiographic results", xlab = "", ylab = "Proportion",
col = c("blue", "red", "blue", "red"), beside = TRUE)
legend("topleft", rownames(data), fill = c("blue", "red"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

2.4) Bivariate joint p.m.f of the slope of the peak exercise ST segment and target along with grouped bar plot
cross_tab_proportion(heart$slope,heart$target)
## healthy sick total
## down 0.03040541 0.04054054 0.07094595
## flat 0.16216216 0.30067568 0.46283784
## up 0.34797297 0.11824324 0.46621622
## total 0.54054054 0.45945946 1.00000000
# Grouped bar plot
data <- prop.table(table(heart$target,heart$slope))
barplot(data, main = "Slope of the peak exercise ST segment", xlab = "", ylab = "Proportion",
col = c("blue", "red", "blue", "red"), beside = TRUE)
legend("topleft", rownames(data), fill = c("blue", "red"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

2.5) Bivariate joint p.m.f of thalesemia blood disorder and target along with grouped bar plot
cross_tab_proportion(heart$thal,heart$target)
## healthy sick total
## fix 0.02027027 0.04054054 0.06081081
## norm 0.42905405 0.12162162 0.55067568
## rev 0.09121622 0.29729730 0.38851351
## total 0.54054054 0.45945946 1.00000000
# Grouped bar plot
data <- prop.table(table(heart$target,heart$thal))
barplot(data, main = "Thal", xlab = "", ylab = "Proportion",
col = c("blue", "red", "blue", "red"), beside = TRUE)
legend("topleft", rownames(data), fill = c("blue", "red"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

2.6) Bivariate joint p.m.f of the number of major vessels and target along with grouped bar plot
cross_tab_proportion(heart$ca,heart$target)
## healthy sick total
## 0 0.43581081 0.14864865 0.58445946
## 1 0.07094595 0.14864865 0.21959459
## 2 0.02364865 0.10472973 0.12837838
## 3 0.01013514 0.05743243 0.06756757
## total 0.54054054 0.45945946 1.00000000
# Grouped bar plot
data <- prop.table(table(heart$target,heart$ca))
barplot(data, main = "Proportion of major vessels", xlab = "", ylab = "Proportion",
col = c("blue", "red", "blue", "red"), beside = TRUE)
legend("topright", rownames(data), fill = c("blue", "red"), xpd=TRUE, inset=c(0, -.1), horiz = T, bty = "n")

3) \(\chi^2\) test: Testing association between “target” and other discrete variables
# Function for cross tabulation (count)
cross_tab_count <- function(x,y)
{
temp_table <- table(x,y)
temp_col <- colSums(temp_table)
temp_row <- rowSums(temp_table)
temp_sum <- sum(temp_table)
temp <- as.data.frame(rbind(cbind(temp_table,temp_row),c(temp_col,temp_sum)))
rownames(temp)[nrow(temp)] <- "total"
colnames(temp)[ncol(temp)] <- "total"
return(temp)
}
3.1) Testing association between gender and target
cross_tab_count(heart$sex,heart$target)
## healthy sick total
## fem 71 24 95
## male 89 112 201
## total 160 136 296
# Chi-squared test
chisq.test(heart$sex,heart$target,correct = F)
##
## Pearson's Chi-squared test
##
## data: heart$sex and heart$target
## X-squared = 24.097, df = 1, p-value = 0.0000009161
3.2) Testing association between chest pain and target
cross_tab_count(heart$cp,heart$target)
## healthy sick total
## abnang 40 9 49
## angina 16 7 23
## asympt 39 102 141
## notang 65 18 83
## total 160 136 296
# Chi-squared test
chisq.test(heart$cp,heart$target)
##
## Pearson's Chi-squared test
##
## data: heart$cp and heart$target
## X-squared = 76.454, df = 3, p-value < 0.00000000000000022
3.3) Testing association between resting ECG and target
cross_tab_count(heart$restecg,heart$target)
## healthy sick total
## abn 1 3 4
## hyp 67 78 145
## norm 92 55 147
## total 160 136 296
# Chi-squared tests
chisq.test(heart$restecg,heart$target)
##
## Pearson's Chi-squared test
##
## data: heart$restecg and heart$target
## X-squared = 9.2624, df = 2, p-value = 0.009743
3.4) Testing association between the slope of the peak exercise ST segment and target
cross_tab_count(heart$slope,heart$target)
## healthy sick total
## down 9 12 21
## flat 48 89 137
## up 103 35 138
## total 160 136 296
# Chi-squared test
chisq.test(heart$target,heart$slope)
##
## Pearson's Chi-squared test
##
## data: heart$target and heart$slope
## X-squared = 44.553, df = 2, p-value = 0.0000000002116
3.5) Testing association between thalsemia blood disorder and target
cross_tab_count(heart$thal,heart$target)
## healthy sick total
## fix 6 12 18
## norm 127 36 163
## rev 27 88 115
## total 160 136 296
# Chi-squared test
chisq.test(heart$thal,heart$target)
##
## Pearson's Chi-squared test
##
## data: heart$thal and heart$target
## X-squared = 83.765, df = 2, p-value < 0.00000000000000022
3.6) Testing association between the number of major vessels and target
cross_tab_count(heart$ca,heart$target)
## healthy sick total
## 0 129 44 173
## 1 21 44 65
## 2 7 31 38
## 3 3 17 20
## total 160 136 296
# Chi-squared test
chisq.test(heart$ca,heart$target)
##
## Pearson's Chi-squared test
##
## data: heart$ca and heart$target
## X-squared = 73.396, df = 3, p-value = 0.0000000000000007996
4) Testing of equality of two proportions
4.1) Testing whether the proportion of sick male and sick female are equal or not
# Spoken Tutorials:
# 19. Data Manipulation using dplyr Package
# 20. More Functions in dplyr Package
# 21. Pipe Operator
sick_female <- as.vector(table(heart %>% filter(target=="sick") %>% select(sex)))[1]
sick_male <- as.vector(table(heart %>% filter(target=="sick") %>% select(sex)))[2]
total_female <- length(unlist(heart %>% select(sex) %>% filter(sex=="fem")))
total_male <- length(unlist(heart %>% select(sex) %>% filter(sex=="male")))
proportion_test <- function(n1,N1,n2,N2)
{
p1 <- n1/N1
p2 <- n2/N2
p <- (n1+n2)/(N1+N2)
se <- sqrt(p*(1-p)*((1/N1)+1/(N2)))
result <- (p1-p2)/se
return(cat("p1:",p1,"p2:",p2,"z-statistic:",result,"p-value:",c(2*(1-pnorm(abs(result))))))
}
proportion_test(sick_female,total_female,sick_male,total_male)
## p1: 0.2526316 p2: 0.5572139 z-statistic: -4.908864 p-value: 0.000000916056
4.2) Testing equality of two proportions by using a built-in R function
sick_female <- as.vector(table(heart %>% filter(target=="sick") %>% select(sex)))[1]
sick_male <- as.vector(table(heart %>% filter(target=="sick") %>% select(sex)))[2]
total_female <- length(unlist(heart %>% select(sex) %>% filter(sex=="fem")))
total_male <- length(unlist(heart %>% select(sex) %>% filter(sex=="male")))
prop.test(x=c(sick_female,sick_male), n=c(total_female,total_male),alternative = "two.sided", correct = FALSE)
##
## 2-sample test for equality of proportions without continuity
## correction
##
## data: c(sick_female, sick_male) out of c(total_female, total_male)
## X-squared = 24.097, df = 1, p-value = 0.0000009161
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.4157135 -0.1934512
## sample estimates:
## prop 1 prop 2
## 0.2526316 0.5572139