Module 2

A solved example of SOM using the Iris dataset


In this module we apply the concepts from Module 1 and implement an SOM for dimensionality reduction on the Iris dataset [1]. The module is divided into six parts.

Part 1: Loading necessary libraries

# 1.1) Loading the necessary libraries.
library(dplyr)
library(photobiology)
library(ggplot2)

Part 2: Creating required functions

# 2.1) A function to compute the Euclidean distance between the "x" and "y" vectors.
euclidean_distance <- function(x, y) {
  ret <- sqrt(sum((x - y)^2))
  return(ret)
}
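
As a quick illustration (not part of the original module), the familiar 3-4-5 right triangle can be used to sanity-check this helper:

# Illustrative check: the distance between (0, 0) and (3, 4) is 5.
euclidean_distance(c(0, 0), c(3, 4))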

# 2.2) Function to create an SOM grid with random initial weights.
# "x" and "y" are the grid dimensions, so "x*y" is the number of neurons.
# "p" is the number of columns of the original dataset, i.e. the number of input dimensions.
create_grid <- function(x,y,p) {
  ret <- matrix(data = rnorm(x * y * p), nrow = x * y, ncol = p)
  return(ret)
}
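
For example (an illustrative check, not part of the original module), a 3-by-3 grid over 4 input dimensions is simply a 9 x 4 matrix of random initial weights:

# Illustrative check: a 3 x 3 grid for 4 input dimensions is a 9 x 4 weight matrix.
example_grid <- create_grid(3, 3, 4)
dim(example_grid) # 9 4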

# 2.3) Function to decay the radius exponentially over time. 
# "radius" is the initial radius value.
# "current_iteration" represents the current iteration.
# "time_constant" is the time constant.
decay_radius_function <- function(radius, current_iteration, time_constant) {
  ret <- radius * exp(-current_iteration / time_constant)
  return(ret)
}

# 2.4) Function to decay the learning rate.
# "learning_rate" is the current learning rate.
# "current_iteration" is the current iteration.
# "n_iteration" is the number of iterations.
decay_learning_rate <- function(learning_rate, current_iteration, n_iteration) {
  ret <- learning_rate * exp(-current_iteration / n_iteration)
  return(ret)
}
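
To see how the two schedules behave, the following sketch (illustrative only, not part of the original module) evaluates the radius and the learning rate at a few points of a 150-iteration run, using the same initial values as the SOM function defined below (initial radius 15, initial learning rate 0.5, time constant 150 / log(15)):

# Illustrative sketch: how the radius and the learning rate shrink over 150 iterations.
iters <- c(1, 50, 100, 150)
decay_radius_function(15, iters, 150 / log(15)) # Decays from about 14.7 down to 1.
decay_learning_rate(0.5, iters, 150) # Decays from about 0.50 down to about 0.18.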

# 2.5) A function to calculate the influence over neighbouring neurons.
# "distance" is the lateral distance.
# "radius" is the current neighbourhood radius.
influence_calculation <- function(distance, radius) {
  ret <- exp(-(distance^2) / (2 * (radius^2)))
  return(ret)
}
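
The influence is a Gaussian function of the lateral distance: it equals 1 at the winning neuron itself and shrinks as the distance grows. A few illustrative values (not part of the original module):

# Illustrative values for a radius of 2.
influence_calculation(0, 2) # 1 at the winning neuron itself.
influence_calculation(2, 2) # exp(-0.5), roughly 0.61.
influence_calculation(4, 2) # exp(-2), roughly 0.14.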

# 2.6) A function to obtain the winning neuron.
# "x" is a single row of data.
# "input_grid" is the grid.
BMU <- function(x, input_grid) { 
  min_distance <- Inf # Initializing the minimum distance so that any real distance is smaller.
  min_ind <- -1 # Initializing the index of the winning neuron.
  for (e in 1:nrow(input_grid)) # Iterating through the grid.
  {
    distance <- euclidean_distance(x, input_grid[e, ]) # Calculating the Euclidean distance.
    if (distance < min_distance) {
      min_distance <- distance # Updating the minimum distance for the winning neuron.
      min_ind <- e # Updating the winning neuron.
    }
  }
  return(min_ind - 1) # Returning the zero-indexed value of the winning neuron.
}

# 2.7) Optimized BMU implementation.
BMU_Vectorised <- function(x, input_grid) { 
  dist_mtrx=rowSums(sweep(input_grid,2,x)^2) # Calculating the distance of "x" row from all the neurons using matrix operation.
  min_ind=which.min(dist_mtrx) # Finding the location of the neuron associated with the minimum distance.
  return (min_ind-1) # Returning the zero-indexed value of the winning neuron.
}
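
Both implementations should return the same zero-indexed winning neuron. A minimal consistency check on random toy data (illustrative only, not part of the original module):

# Illustrative consistency check: the loop-based and vectorised BMU return the same index.
set.seed(1)
toy_grid <- create_grid(3, 3, 4) # A small 3 x 3 grid with 4 input dimensions.
toy_row <- rnorm(4) # A random input row.
BMU(toy_row, toy_grid) == BMU_Vectorised(toy_row, toy_grid) # TRUE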

# 2.8) A function to encapsulate the complete SOM algorithm.
# "x" is the input.
# "input_grid" is the SOM grid that shall get updated with each iteration.
SOM <- function(x, input_grid) {
  breaker <- 0 # Flag used to stop training early once the weights stop changing.
  n_iteration <- nrow(x) # Defining the number of iterations.
  initial_learning_rate <- 0.5 # Defining initial learning rate.
  initial_radius <- 15 # Defining initial radius.
  time_constant <- n_iteration / log(initial_radius) # Initializing time constant.
  lateral_distance_points=expand.grid(1:sqrt(nrow(input_grid)),1:sqrt(nrow(input_grid))) # Initializing physical locations of neurons to figure out lateral distance.
  rows=sqrt(nrow(input_grid)) # Taking the number of rows as square root of the number of entries in the grid.
  n_epochs=40 # Defining the number of epochs.
  new_radius <- initial_radius
  l <- c() # Vector recording the change in the weights at each iteration.
  for(ne in 1:n_epochs)
  {
    extra <- ((ne-1)*400) # Offset added to the iteration counter so the radius and learning rate keep decaying across epochs (400 is a hand-tuned constant).
    for (i in 1:n_iteration) # Looping through for training.
    {
      old_grid=input_grid
      curr_i <- extra + i
      sample_input_row <- as.vector(unlist(x[sample(1:nrow(x), size = 1, replace = F), ])) # Selecting random input row from the given dataset.
      new_radius <- decay_radius_function(initial_radius, curr_i, time_constant) # Decaying the radius.
      new_learning_rate <- decay_learning_rate(initial_learning_rate,curr_i, n_iteration) # Decaying the learning rate.
      index_temp <- BMU_Vectorised(sample_input_row, input_grid) # Finding best matching unit for the given input row.
      index_new=c((as.integer(index_temp/rows)+1),(index_temp%%rows)+1) # Converting a 1D co-ordinate to a 2D co-ordinate to find lateral distance on the map.
      lateral_distance=sqrt(abs(rowSums(sweep(lateral_distance_points,2,index_new)^2))) # Finding euclidean distance between the given best matching unit and all units of the map.
      rn=which(lateral_distance<=new_radius) # Finding neurons that are within the radius of the winning neuron.
      inf=influence_calculation(lateral_distance[rn],new_radius) # Calculating the influence of winning neuron on neighbours.
      if(length(rn)!=1) # Updating multiple rows if neighbourhood is large.
      {
        diff_grid=(sweep(input_grid[rn,],2,sample_input_row))*-1 # Difference between the given data point and the weights of the winning neuron & its neighbours.
        updated_weights=new_learning_rate*inf*diff_grid # Updating operation performed over the winning and neighbouring neurons.
        input_grid[rn,]=input_grid[rn,]+updated_weights # Updating grid entries that are either the winning neuron or its neighbours.
      }
      else # Updating only winning neuron.
      {
        diff_row=(input_grid[rn,]-sample_input_row)*-1 # Difference between the given data point and the weights of the winning neuron.
        updated_weights=new_learning_rate*inf*diff_row # Update computed for the winning neuron only.
        input_grid[rn,]=input_grid[rn,]+updated_weights # Updating the grid entry of the winning neuron.
      }
      l <- c(l,euclidean_distance(old_grid,input_grid)) # Recording the total change in weights for this iteration.
      if(isTRUE(all.equal(old_grid,input_grid))) # Stopping early if the weights no longer change.
      {
        breaker <- 1
        break
      }
    }
    if(breaker ==1)
    {
      break
    }
  }
  return(list(input_grid,l)) # Returning the updated SOM weights and the history of per-iteration weight changes.
}

Part 3: Loading data files for SOM implementation

# 3.1) Reading the Iris data (dropping the species column) and scaling it.
data <- scale(iris[, -5])

# 3.2) Setting seed to obtain reproducible results.
set.seed(222)

# 3.3) Creating the grid of neurons.
grid <- create_grid(5,5,4)

Part 4: Generating the Self-Organizing Map

# 4.1) Generating SOM.
y <- SOM(data,grid)

# 4.2) Transforming the SOM weights to usable form.
gridSOM <- y[1]
gridSOM
## [[1]]
##              [,1]        [,2]       [,3]        [,4]
##  [1,]  1.51979580  0.03237630  1.2644454  1.14230472
##  [2,]  1.24099155 -0.02410680  1.0516778  0.98407574
##  [3,]  0.73293736 -0.29868812  0.7240412  0.60851629
##  [4,]  0.15410923 -0.77490850  0.4535389  0.34719100
##  [5,]  0.01098373 -0.59608748  0.4244492  0.37848457
##  [6,]  1.20915449 -0.09180618  1.0353214  1.03089388
##  [7,]  1.02275478 -0.05746048  0.8055906  0.72338924
##  [8,]  0.63555127 -0.35351906  0.4973977  0.33937501
##  [9,]  0.22881853 -0.72438396  0.2992720  0.11486734
## [10,] -0.19835820 -0.62107698  0.1771841  0.09836969
## [11,]  0.68795681 -0.41771178  0.7460912  0.66425283
## [12,]  0.66586054 -0.34578508  0.5388258  0.37420532
## [13,]  0.26071271 -0.24531347  0.1953997  0.07510772
## [14,] -0.77764817 -0.06289039 -0.7263837 -0.73873748
## [15,] -0.62067355  0.19287913 -0.5233841 -0.56000999
## [16,]  0.20381249 -0.78914720  0.4569668  0.30244892
## [17,]  0.19980266 -0.75748213  0.3117550  0.10648900
## [18,] -0.71927940 -0.11619778 -0.7058482 -0.68951213
## [19,] -1.15931006  0.45744592 -1.2669385 -1.21821706
## [20,] -0.99092608  1.00557473 -1.2819808 -1.20593553
## [21,]  0.09036951 -0.67733186  0.4509914  0.32609800
## [22,] -0.19596576 -0.61017007  0.1541737  0.05987712
## [23,] -0.71114901  0.33176281 -0.7535654 -0.72404885
## [24,] -0.98814935  1.01000002 -1.2860587 -1.21374472
## [25,] -0.74348052  1.75151250 -1.2995359 -1.23497618
l <- y[2] # Extracting the recorded per-iteration weight changes.
t=1:length(l[[1]])
plot(t,l[[1]]) # Plotting the change in weights against the training iteration.

l=unlist(l)
l[which.min(unlist(l))]
## [1] 4.591932e-08

The above plot shows how quickly the SOM updates its weights over the course of training. Each point is the difference (Euclidean norm) between the network weights of consecutive training iterations. As training progresses, the learning rate and the neighbourhood radius decay, so the weight updates become smaller and the map fixates on finer details.
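
For readability, the same curve can be redrawn with descriptive axis labels (an optional cosmetic tweak, not part of the original output):

# Optional: redraw the weight-change curve with descriptive axis labels.
plot(t, l, xlab = "Training iteration",
     ylab = "Change in SOM weights between consecutive iterations")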

Part 5: Visualizing results

# 5.1) Creating a function to visualize the SOM results by mapping numeric values to colors.
drawGrid<- function(weight,dimension){
  
  # Converting to a matrix.
  weight <- as.matrix(weight)
  
  norm.matrix<-NULL
  
  # Calculating the Euclidean norm of each weight vector (row).
  for(i in 1:length(weight[,1])){
    a<-norm(weight[i,], type = "2")
    norm.matrix<-rbind(norm.matrix,a)
  }
  
  # Mapping to the wavelength range from 400 to 700.
  input_start<-min(norm.matrix)
  input_end<-max(norm.matrix)
  output_start<-400
  output_end<-700
  
  # Calculating the wavelength for each neuron by linearly rescaling its row norm.
  color<-NULL
  for(i in 1:length(norm.matrix)){
    input = norm.matrix[i]
    output = output_start + ((output_end - output_start) / (input_end - input_start)) * (input - input_start)
    color<-rbind(color,output)
  }
  
  # Getting colors (hex values) from wavelength values.
  color.rgb<-w_length2rgb(color)
  
  # Plotting the grid.
  dim<-max(dimension)+1
  plot(1:dim, type = "n")
  
  for (i in 1:dimension[1]) {
    for(j in 1:dimension[2]){
      rect(i,j,i+1,j+1, col = color.rgb[i*dimension[1]+j - dimension[1]])
    }
  }
} 

# 5.2) Plotting the grid of weights.
gridSOM=matrix(unlist(gridSOM),ncol=4)
drawGrid(gridSOM,c(5,5))
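
As an optional follow-up check (not part of the original module), each scaled observation can be mapped to its best matching unit and cross-tabulated with the species labels to see how the three species spread across the 5 x 5 grid. A minimal sketch using the functions defined above:

# Optional check: map each scaled observation to its (zero-indexed) BMU
# and cross-tabulate the result with the species labels.
bmu_per_row <- apply(data, 1, BMU_Vectorised, input_grid = gridSOM)
table(iris$Species, bmu_per_row)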

Part 6: Conclusion

Dimensionality reduction was successfully performed on the Iris dataset [1] using an SOM implemented in R. The result is a 2D grid containing three color regions (green, blue and red) that correspond to the dataset's three original clusters: Iris setosa, Iris versicolor and Iris virginica. Effective visualization is an important challenge in any SOM implementation, because the weights need to be displayed in a way that exposes the clusters inherent in the data. The parameter values used in the code above were chosen after several trial runs and gave the best results among the settings tried.

References

[1] Fisher, R. A. (1936) The use of multiple measurements in taxonomic problems. Annals of Eugenics, 7, Part II, 179–188.