# Wine data: 
# Contains data on concentrations of 13 different chemicals 
# in wines grown in the same region in Italy 
# that are derived from three different cultivars. 


# Data file structure:
# There is one row per wine sample. 
# The first column contains the cultivar of a wine sample (labelled 1, 2 or 3).
# The following thirteen columns contain the concentrations 
# of the 13 different chemicals in that sample. 


# The columns are separated by commas and there are data on 178 samples.
# The data can be loaded in either of the two ways shown below.


###########################################################################


# Method-1: From original source using the following commands (in R software)

# Fetch the raw comma-separated data from the UCI repository.
# The file has no header row, so columns arrive as V1..V14.
wine <- read.table(
  file = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
  header = FALSE,
  sep = ","
)

# Attach descriptive variable names by hand: the cultivar label first,
# followed by the 13 chemical measurements.
names(wine) <- c(
  "Cvs",
  "Alcohol",
  "Malic acid",
  "Ash",
  "Alcalinity of ash",
  "Magnesium",
  "Total phenols",
  "Flavanoids",
  "Nonflavanoid phenols",
  "Proanthocyanins",
  "Color intensity",
  "Hue",
  "OD280/OD315 of diluted wines",
  "Proline"
)

# Method-2: 
# Download the data file from UoH website using the following link. 
# Store this file on desktop
# Inside R change directory to Desktop
# Read the CSV file wine_data.csv
# Note the file already contains the variable names on top as column header

# Method-2 is an alternative to Method-1, so only use the local CSV when it
# is actually present in the working directory. Otherwise keep whatever
# `wine` already holds instead of aborting the whole script.
if (file.exists("wine_data.csv")) {
  # First row of the CSV holds the variable names.
  wine <- read.csv("wine_data.csv", header = TRUE)
} else {
  message("wine_data.csv not found in ", getwd(), "; skipping Method-2.")
}


###########################################################################


# Visualize the data (partially, since there are 13 variables)
 
## If the car package is already installed, you can instead use:
# library(car)
# scatterplotMatrix(wine[2:6])

# Else use plot or pairs
# Scatterplot matrix of the first five chemical variables (columns 2-6).
plot(wine[2:6])

# Scatterplot matrix of all 13 variables, coloured by cultivar (column 1).
# Only the lower triangle is drawn to reduce clutter.
pairs(wine[, -1], col = wine[, 1], upper.panel = NULL, pch = 16, cex = 0.5)

# Legend for the cultivar colours. The symbol (pch = 16) and the colours
# (palette entries 1:3) are the same ones used for the points above, so the
# legend matches the plot regardless of the active colour palette.
# xpd = TRUE lets the legend be drawn outside the plot region.
legend(
  "topright",
  bty = "n",
  legend = c("Cv1", "Cv2", "Cv3"),
  pch = 16,
  col = 1:3,
  xpd = TRUE,
  cex = 1,
  y.intersp = 0.5
)


###########################################################################


# PCA using prcomp
# There are other methods/options available in R for PCA
# Standardise the 13 chemical variables (cultivar column dropped) and run
# PCA on the standardised matrix.
wine.pca1 <- prcomp(x = scale(wine[-1], center = TRUE, scale = TRUE))

# Importance of components: standard deviation and proportion of variance.
summary(object = wine.pca1)
# Standard deviations and the rotation (loadings) matrix.
print(x = wine.pca1)
# Principal component scores of the samples.
predict(object = wine.pca1)

# The scree plot displays the index of each principal component
# against its variance (eigenvalue), ordered from largest to smallest.
# It is used to judge how many components are worth keeping.
screeplot(wine.pca1, type = "lines")

# The standard deviation of the components 
wine.pca1[["sdev"]]

# Total variance explained by the components: the squared standard
# deviations (i.e. the component variances) added together
sum(wine.pca1[["sdev"]]^2)

# Loadings of the first principal component (first column of the rotation)
wine.pca1[["rotation"]][, 1]

# A biplot displays the data points along two principal components 
# the first and second components, by default.
# Arrows indicate the contributions of each variable to these components. 
# The graphs can get messy if there are too many variables. 
biplot(wine.pca1, choices = 1:2, cex = 0.7)

# Other component pairs are selected through the `choices` argument,
# e.g. the 3rd and 4th principal components:
biplot(wine.pca1, choices = 3:4, cex = 0.7)


# Now try PCA without scaling the data
# Compare the results obtained with scaling
# PCA on the raw (centred but unscaled) chemical variables.
wine.pca2 <- prcomp(x = wine[-1], center = TRUE, scale. = FALSE)


###########################################################################


# MDS: Multidimensional scaling
# Classical/Metric MDS

# Pairwise Euclidean distances between samples (rows), cultivar column excluded
d <- dist(wine[-1], method = "euclidean")
# Classical MDS into k = 2 dimensions; eig = TRUE also returns the eigenvalues
wine.mds <- cmdscale(d, k = 2, eig = TRUE)
# View the results (the output can be long)
wine.mds


###########################################################################


# LDA: Linear Discriminant Analysis 
library("MASS") 

# Fit the LDA using the matrix interface: the 13 chemical variables are the
# predictors and the cultivar (column 1) is the grouping factor.
# Unlike a formula built from `wine[, k]` terms, this keeps the real variable
# names in the output and lets predict() honour the `newdata` it is given.
wine.lda <- lda(x = as.matrix(wine[, -1]), grouping = wine[, 1])

# Print the fitted model: prior probabilities, group means, coefficients of
# the linear discriminants and the proportion of trace.
wine.lda

# Scalings: the loadings of each variable on the first discriminant function
wine.lda$scaling[, 1]

# The “proportion of trace” is the percentage separation achieved 
# by each discriminant function.

# A Stacked Histogram of the LDA Values.
# The values of the discriminant function for different groups 
# Discriminant scores ($x), predicted classes and posteriors for every sample.
wine.lda.values <- predict(object = wine.lda, newdata = wine[-1])
# Stacked histograms of the first, then the second discriminant function,
# one panel per cultivar.
ldahist(wine.lda.values[["x"]][, 1], wine[[1]])
ldahist(wine.lda.values[["x"]][, 2], wine[[1]])


###########################################################################


# Comparative plot of three methods

# Open a JPEG graphics device: everything plotted until dev.off() is
# written to this file in the current working directory.
jpeg("PCA_MDS_LDA_output.jpg")

# Split the device into a 2 x 2 grid so the three plots share one image.
par(mfrow = c(2, 2))

# PCA: scatterplot of the first two principal components, coloured by cultivar
plot(wine.pca1$x[, 1:2], col = wine[, 1], xlab = "PC 1", ylab = "PC 2", main = "PCA")

# MDS: the two coordinates of the classical MDS solution
x <- wine.mds$points[, 1]
y <- wine.mds$points[, 2]
plot(x, y, xlab = "Coordinate 1", ylab = "Coordinate 2", main = "MDS", col = wine[, 1])

# LDA: scores on the first two discriminant functions
x2 <- wine.lda.values$x[, 1]
y2 <- wine.lda.values$x[, 2]
plot(x2, y2, xlab = "Discriminant 1", ylab = " Discriminant 2", main = "LDA", col = wine[, 1])

# Close the JPEG device so the file is flushed to disk
dev.off()


###########################################################################


# To quit/exit R
quit()