# Wine data:
# Contains data on concentrations of 13 different chemicals
# in wines grown in the same region in Italy
# that are derived from three different cultivars.
# Data file structure:
# There is one row per wine sample.
# The first column contains the cultivar of a wine sample (labelled 1, 2 or 3).
# The following thirteen columns contain the concentrations
# of the 13 different chemicals in that sample.
# The columns are separated by commas and there are data on 178 samples.
# The data can be loaded in either of the two ways shown below.
###########################################################################
# Method-1: From original source using the following commands (in R software)
# Read the comma-separated file directly from the UCI repository.
wine <- read.table(
  file = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
  sep = ","
)
# read.table is called without a header, so the variable names are assigned
# by hand: the cultivar label first, then the 13 chemical measurements.
names(wine) <- c(
  "Cvs",
  "Alcohol",
  "Malic acid",
  "Ash",
  "Alcalinity of ash",
  "Magnesium",
  "Total phenols",
  "Flavanoids",
  "Nonflavanoid phenols",
  "Proanthocyanins",
  "Color intensity",
  "Hue",
  "OD280/OD315 of diluted wines",
  "Proline"
)
# Method-2:
# Download the data file from UoH website using the following link.
# Store this file on desktop
# Inside R change directory to Desktop
# Read the CSV file wine_data.csv
# Note the file already contains the variable names on top as column header
# Read the local CSV copy; header = TRUE takes the variable names from the
# first row of the file. TRUE is spelled out because T is an ordinary
# variable that can be reassigned, whereas TRUE is a reserved word.
wine <- read.csv("wine_data.csv", header = TRUE)
###########################################################################
# Visualize the data (partially, since there are 13 variables)
## If the car package is already installed in R on this PC, use:
# library(car)
# scatterplotMatrix(wine[2:6])
# Else use plot or pairs
# Scatterplot matrix of columns 2 to 6 (the first five chemical variables).
plot(wine[2:6])
# Lower-triangle scatterplot matrix of all chemical variables, coloured by the
# cultivar code in column 1 (the codes 1, 2, 3 index the active colour palette).
pairs(wine[, -1], col = wine[, 1], upper.panel = NULL, pch = 16, cex = 0.5)
# The legend uses the same symbol (pch = 16) and the same palette indices
# (1:3) as the points, so the key matches what is actually drawn.
# xpd = TRUE allows the legend to be drawn outside the plot region.
legend("topright", bty = "n", legend = c("Cv1", "Cv2", "Cv3"), pch = 16,
       col = 1:3, xpd = TRUE, cex = 1, y.intersp = 0.5)
###########################################################################
# PCA using prcomp
# There are other methods/options available in R for PCA.
# The cultivar column is dropped and the remaining variables are centred and
# scaled to unit variance by prcomp itself (instead of calling scale() first).
# This way the fitted object stores the centring and scaling values, so
# predict(wine.pca1, newdata) can be applied to raw, unscaled measurements.
wine.pca1 <- prcomp(wine[, -1], center = TRUE, scale. = TRUE)
summary(wine.pca1)  # importance of the components
print(wine.pca1)    # standard deviations and loadings (rotation matrix)
predict(wine.pca1)  # without newdata: scores of the samples used for the fit
# The scree plot displays the number of the principal component
# versus its corresponding eigenvalue (the variance of that component).
# The scree plot orders the eigenvalues from largest to smallest.
screeplot(wine.pca1, type = "lines")
# Standard deviation of each principal component
wine.pca1[["sdev"]]
# Total variance explained = sum of the component variances
# (the squared standard deviations)
sum(wine.pca1[["sdev"]]^2)
# Loadings of the first principal component
wine.pca1[["rotation"]][, 1]
# A biplot shows the samples projected onto two principal components
# (the first and second unless told otherwise); the arrows indicate how each
# variable contributes to those components.
# With many variables the display can become cluttered.
biplot(wine.pca1, cex = 0.7)
# Other components are selected through the choices argument,
# e.g. the 3rd and 4th principal components:
biplot(wine.pca1, choices = c(3, 4), cex = 0.7)
# Repeat the PCA on the unscaled variables (centred only)
# and compare with the results obtained from the scaled data.
wine.pca2 <- prcomp(wine[, -1], center = TRUE, scale. = FALSE)
###########################################################################
# MDS: Multidimensional scaling
# Classical (metric) MDS
# Pairwise distances between the rows (dist defaults to Euclidean distance);
# the cultivar column is excluded.
d <- dist(wine[, -1])
# k = number of dimensions of the configuration; eig = TRUE also returns
# the eigenvalues.
wine.mds <- cmdscale(d, k = 2, eig = TRUE)
# View the results (the output can be long)
print(wine.mds)
###########################################################################
# LDA: Linear Discriminant Analysis
library("MASS")
# The predictors (all columns except the first) and the cultivar labels
# (first column) are passed directly instead of through a formula made of
# wine[, i] terms. Such a formula always looks its terms up in the global
# `wine`, so predict() would silently ignore any newdata, and it hard-codes
# the number of predictor columns.
wine.lda <- lda(x = wine[, -1], grouping = wine[, 1])
# Loadings for the Discriminant Functions:
# printing the fit shows the coefficients of the linear discriminants.
wine.lda
# Scalings: the loadings of the first discriminant function
wine.lda$scaling[, 1]
# The "proportion of trace" reported for the fit is the percentage of
# separation achieved by each discriminant function.
# Stacked histograms of the LDA values:
# first compute the discriminant scores of the samples ...
wine.lda.values <- predict(wine.lda, newdata = wine[, -1])
# ... then draw one histogram per cultivar for the first and for the second
# discriminant function.
ldahist(wine.lda.values$x[, 1], g = wine[, 1])
ldahist(wine.lda.values$x[, 2], g = wine[, 1])
###########################################################################
# Comparative plot of three methods
# Open a JPEG device so that the plots are written to a file
jpeg("PCA_MDS_LDA_output.jpg")
# Partition the device into a 2 x 2 grid for multiple plots
# (three of the four panels are used)
par(mfrow = c(2, 2))
# PCA: scatterplot of the first two principal components,
# coloured by the cultivar code in column 1
plot(wine.pca1$x[, 1:2], col = wine[, 1], xlab = "PC 1", ylab = "PC 2",
     main = "PCA")
# MDS: plot the two-dimensional solution
x <- wine.mds$points[, 1]
y <- wine.mds$points[, 2]
plot(x, y, xlab = "Coordinate 1", ylab = "Coordinate 2", main = "MDS",
     col = wine[, 1])
# LDA: plot the first two discriminant functions
x2 <- wine.lda.values$x[, 1]
y2 <- wine.lda.values$x[, 2]
plot(x2, y2, xlab = "Discriminant 1", ylab = " Discriminant 2", main = "LDA",
     col = wine[, 1])
# Close the device; this is what finalises the JPEG file
dev.off()
###########################################################################
# End the R session
quit()