library(seqinr) library(protr) library(party) library(BioSeqClass) library(randomForest) library(lme4) library(caret) library(adabag) library(tree) library(rpart) library(DMwR) library(e1071) library(MASS) ### Read in ready-to-go data: DATA = read.csv("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/50_50.csv", header = TRUE, row.names = 1) ### Remove one amino acid from consideration: DATA = DATA[,c(1,3:36)] ### Save character vector of locations: classes = levels(DATA[,1]) ### Split data into subsets, each one containing all proteins from one location: subsets = lapply(classes, function(z){ a = subset(DATA, DATA[,1] == z) }) ### Name each subset by its associated location: names(subsets) = classes ### Create standard vector for each location: standard = lapply(1:length(subsets), function(z){ a = colSums(subsets[[z]][,-1])/nrow(subsets[[z]]) }) ### Name each standard vector by its associated location: names(standard) = classes ### Initialize (16) 34x34 covariance matrices, one for each location: C = vector(mode = "list", length = length(classes)) C = lapply(C, function(z){ return(matrix(0, nrow = 34, ncol = 34)) }) ### For each location, fill-in the values of its covariance matrix: for(z in 1:length(classes)){ for(i in 1:34){ for(j in 1:34){ C[[z]][i,j] = (sum((subsets[[z]][,-1][,i] - as.vector(t(standard[[z]][i])))*(subsets[[z]][,-1][,j] - as.vector(t(standard[[z]][j])))))/(nrow(subsets[[z]]) - 1) } } print(z) } ### Name each covariance matrix by its associated location: names(C) = classes ### Initialize (16) 34x34 inverse covariance matrices, one for each location: C_inv = vector(mode = "list", length = length(classes)) C_inv = lapply(C_inv, function(z){ return(matrix(0, nrow = 34, ncol = 34)) }) ### Save each covariance matrix to machine (for later use in MATLAB): for(i in 1:length(classes)){ write.table(C[[i]], file = paste("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/matrices_for_CDA/matrix", i, ".csv", sep = ""), sep = ",", row.names = FALSE, col.names = FALSE) } ### Try to calculate inverses of covariance matrices in MATLAB ### # M = csvread('c:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/matrices_for_CDA/matrix[i].csv'); % where [i] is in {1,16}, NOT in the brackets # I = (p)inv(M); % this means try taking the inverse, and if can't, then take the pseudoinverse # % although it ended up being the case that pseudoinverses didn't help either # dlmwrite('c:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/matrices_for_CDA/inverse5.csv', I, 'precision', 10); % save covariance matrix ### End of MATLAB stuff ### ### Read in inverse covariance matrices: for(i in 1:length(classes)){ C_inv[[i]] = as.matrix(read.csv(paste("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/matrices_for_CDA/inverse", i, ".csv", sep = ""), header = FALSE)) } ### Initialize (3002) length (16) vectors, one for each protein in the dataset: MHs = vector(mode = "list", length = nrow(DATA)) MHs = lapply(MHs, function(z){ return(vector(mode = "numeric", length = length(classes))) }) ### Name each entry in each MHs vector by its associated location: for(i in 1:length(MHs)){ names(MHs[[i]]) = classes } ### For each protein, calculate the 16 similarity measures that go into its vector: for(i in 1:length(MHs)){ for(j in 1:length(classes)){ MHs[[i]][j] = (unlist(DATA[i,-1] - standard[[j]]))%*%(C_inv[[j]])%*%(unlist(DATA[i,-1] - standard[[j]])) + log(det(C[[j]])) } print(i) } ### Uh oh, for locations 5, 10, 11, 12, 13, and 15, even MATLAB couldn't calculate an inverse ### Pseudoinverses didn't help either, as (Matrix)*(Pseudoinverse) was not giving an identity matrix, or even close ### So for each protein, its similarity measure in location 5, 10, 11, 12, 13, and 15 is useless: ### Thus, to get around this problem, set them as the arbitrary large number 10000000: for(i in 1:length(MHs)){ MHs[[i]][c(5,10:13,15)] = 10000000 } ### Create length (3002) character vector, each entry corresponding to a protein predictions = vector(mode = "character", length = length(MHs)) ### Determine location prediction for each protein by the location associated with its smallest similarity measure value: for(i in 1:length(MHs)){ predictions[i] = names(which.min(MHs[[i]])) } ### Save actual locations of full dataset: actual = DATA[,1] ### Calculate error rate of classification ### (note that we know all proteins actually in locations 5, 10, 11, 12, or 13 will be predicted wrong): error_CDA = 1 - sum(predictions == actual)/nrow(DATA) save.image("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/CDA.RData") ### Note - will need to perform a "work-around" to ensure that the factor ### levels for the predictions are the same as those for the actual data. ### By default, will only have 10 levels, NOT 16, since 6 locations ### never get predicted for. Work-around is as follows: ### Save real predictions for first 6 proteins: first_6 = predictions[1:6] ### Save 6 fake predictions, one for each location that never gets predicted: fake_6 = levels(actual)[c(5,10:13,15)] ### Set predictions for first 6 proteins as the fake ones: predictions[1:6] = fake_6 ### Make predictions a factor - this will have the 16 needed levels: predictions = as.factor(predictions) ### Put correct predictions back in for the first 6 proteins ### Number of levels will stay at 16 like we need! ### (Probably exists a better work-around: let me know!): predictions[1:6] = first_6 ### Create confusion matrix and performance statistics for each location: confusion_CDA = confusionMatrix(data = predictions, reference = actual) ### Save CDA results to machine: write.csv(as.data.frame.matrix(confusion_CDA$table), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/CDA_confusion.csv") write.csv(as.data.frame.matrix(confusion_CDA$byClass), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/CDA_stats.csv") write.table(error_CDA, file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/CDA_stats.csv", append = TRUE) ### Save R workspace: save.image("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/CDA.RData")