# Wine data:
# Contains data on concentrations of 13 different chemicals
# in wines grown in the same region in Italy
# that are derived from three different cultivars.

# Data file structure:
# There is one row per wine sample.
# The first column contains the cultivar of a wine sample (labelled 1, 2 or 3).
# The following thirteen columns contain the concentrations
# of the 13 different chemicals in that sample.
# The columns are separated by commas and there are data on 178 samples.

# MASS provides lda() and ldahist(), used in the LDA section below.
library(MASS)

###########################################################################
# Load the data
# This data can be accessed in two different methods.
#
# Method-1: From original source (UCI repository). That file has no header
#           row, so the variable names have to be added manually.
# Method-2: Download the data file from UoH website, store it as
#           wine_data.csv on the Desktop and change R's working directory
#           to the Desktop. That file already contains the variable names
#           on top as column header.
#
# Only one method is needed: the local CSV is used when it is present in
# the working directory, otherwise the data are fetched from the source.
# All later code addresses columns by position, so either set of column
# names works.

wine_url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
wine_csv <- "wine_data.csv"

if (file.exists(wine_csv)) {
  # Method-2
  wine <- read.csv(wine_csv, header = TRUE)
} else {
  # Method-1
  wine <- read.table(wine_url, sep = ",")
  colnames(wine) <- c(
    "Cvs", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash",
    "Magnesium", "Total phenols", "Flavanoids", "Nonflavanoid phenols",
    "Proanthocyanins", "Color intensity", "Hue",
    "OD280/OD315 of diluted wines", "Proline"
  )
}

###########################################################################
# Visualize the data (partially, since there are 13 variables)

## If the car library is already installed within R on the PC then
# library(car)
# scatterplotMatrix(wine[2:6])

# Else use plot or pairs
plot(wine[2:6])

# All chemical variables, coloured by cultivar (column 1).
pairs(wine[, -1], col = wine[, 1], upper.panel = NULL, pch = 16, cex = 0.5)

# The legend uses the same symbol (pch = 16) and the same palette indices
# (1, 2, 3) as the points, so that it matches the plot exactly.
legend(
  "topright",
  bty = "n", legend = c("Cv1", "Cv2", "Cv3"),
  pch = 16, col = 1:3,
  xpd = TRUE, cex = 1, y.intersp = 0.5
)

###########################################################################
# PCA using prcomp
# There are other methods/options available in R for PCA

# PCA on the standardized variables (cultivar column excluded).
wine.pca1 <- prcomp(scale(wine[, -1]))
summary(wine.pca1)
print(wine.pca1)
predict(wine.pca1) # scores of every sample on every component

# The scree plot displays the number of the principal component
# versus its corresponding eigenvalue.
# The scree plot orders the eigenvalues from largest to smallest.
screeplot(wine.pca1, type = "lines")

# The standard deviation of the components
wine.pca1$sdev

# The total variance explained by the components = sum of the variances
# (for standardized data this equals the number of variables)
sum((wine.pca1$sdev)^2)

# The loadings for the first principal component
wine.pca1$rotation[, 1]

# A biplot displays the data points along two principal components,
# the first and second components, by default.
# Arrows indicate the contributions of each variable to these components.
# The graphs can get messy if there are too many variables.
biplot(wine.pca1, cex = 0.7)

# To visualize other components, set the choices option.
# For example, to plot the 3rd and 4th principal components, use
biplot(wine.pca1, cex = 0.7, choices = c(3, 4))

# Now try PCA without scaling the data
# Compare the results obtained with scaling
wine.pca2 <- prcomp(wine[, -1], scale. = FALSE)

###########################################################################
# MDS: Multidimensional scaling
# Classical/Metric MDS

d <- dist(wine[, -1]) # euclidean distances between the rows
wine.mds <- cmdscale(d, eig = TRUE, k = 2) # k is the number of dim
wine.mds # view results, output could be long

###########################################################################
# LDA: Linear Discriminant Analysis

# Fit with the x / grouping interface: column 1 is the cultivar, the
# remaining columns are the predictors. A formula written in terms of
# wine[, k] expressions cannot be matched against the columns of `newdata`
# in predict(), so the predictors are passed as a data frame instead.
wine.lda <- lda(x = wine[, -1], grouping = wine[, 1])

# Loadings for the Discriminant Functions:
wine.lda

# Scalings: the loadings for each discriminant function
wine.lda$scaling[, 1]

# The "proportion of trace" is the percentage separation achieved
# by each discriminant function.

# A Stacked Histogram of the LDA Values.
# The values of the discriminant function for different groups
wine.lda.values <- predict(wine.lda, wine[, -1])
ldahist(data = wine.lda.values$x[, 1], g = wine[, 1])
ldahist(data = wine.lda.values$x[, 2], g = wine[, 1])

###########################################################################
# Comparative plot of three methods

# To save the plot in a JPEG file
jpeg("PCA_MDS_LDA_output.jpg")

# To partition output window in multiple-frames for multiple plots
par(mfrow = c(2, 2))

# PCA: Scatterplot of the first two principal components
plot(wine.pca1$x[, 1:2],
  col = wine[, 1],
  xlab = "PC 1", ylab = "PC 2", main = "PCA"
)

# MDS: plot solution
x <- wine.mds$points[, 1]
y <- wine.mds$points[, 2]
plot(x, y,
  xlab = "Coordinate 1", ylab = "Coordinate 2", main = "MDS",
  col = wine[, 1]
)

# LDA:
x2 <- wine.lda.values$x[, 1]
y2 <- wine.lda.values$x[, 2]
plot(x2, y2,
  xlab = "Discriminant 1", ylab = " Discriminant 2", main = "LDA",
  col = wine[, 1]
)

# To close the plot/output window/device (this writes the JPEG file)
dev.off()

###########################################################################
# To quit/exit R
q()