library(seqinr)
library(protr)
library(party)
library(BioSeqClass)
library(randomForest)
library(lme4)
library(caret)
library(adabag)
library(tree)
library(rpart)
library(DMwR)
library(e1071)
library(MASS)
### Read in ready-to-go data:
DATA = read.csv("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/50_50.csv", header = TRUE, row.names = 1)
### Remove one amino acid from consideration:
DATA = DATA[,c(1,3:36)]
### Save character vector of locations:
classes = levels(DATA[,1])
### Split data into subsets, each one containing all proteins from one location:
subsets = lapply(classes, function(z){
  a = subset(DATA, DATA[,1] == z)
})
### Name each subset by its associated location:
names(subsets) = classes
### Create standard vector for each location:
standard = lapply(1:length(subsets), function(z){
  a = colSums(subsets[[z]][,-1])/nrow(subsets[[z]])
})
### Name each standard vector by its associated location:
names(standard) = classes
### Initialize (16) 34x34 covariance matrices, one for each location:
C = vector(mode = "list", length = length(classes))
C = lapply(C, function(z){
  return(matrix(0, nrow = 34, ncol = 34))
})
### For each location, fill-in the values of its covariance matrix:
for(z in 1:length(classes)){
  for(i in 1:34){
    for(j in 1:34){
      C[[z]][i,j] = (sum((subsets[[z]][,-1][,i] - as.vector(t(standard[[z]][i])))*(subsets[[z]][,-1][,j] - as.vector(t(standard[[z]][j])))))/(nrow(subsets[[z]]) - 1)
    }
  }
  print(z)
}
### Name each covariance matrix by its associated location:
names(C) = classes
### Initialize (16) 34x34 inverse covariance matrices, one for each location:
C_inv = vector(mode = "list", length = length(classes))
C_inv = lapply(C_inv, function(z){
  return(matrix(0, nrow = 34, ncol = 34))
})
### Save each covariance matrix to machine (for later use in MATLAB):
for(i in 1:length(classes)){
  write.table(C[[i]], file = paste("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/matrices_for_CDA/matrix", i, ".csv", sep = ""), sep = ",", row.names = FALSE, col.names = FALSE)
}
### Try to calculate inverses of covariance matrices in MATLAB ###
# M = csvread('c:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/matrices_for_CDA/matrix[i].csv'); % where [i] is in {1,16}, NOT in the brackets
# I = (p)inv(M); % this means try taking the inverse, and if can't, then take the pseudoinverse
# % although it ended up being the case that pseudoinverses didn't help either
# dlmwrite('c:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/matrices_for_CDA/inverse5.csv', I, 'precision', 10); % save covariance matrix
### End of MATLAB stuff ###
### Read in inverse covariance matrices:
for(i in 1:length(classes)){
  C_inv[[i]] = as.matrix(read.csv(paste("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/matrices_for_CDA/inverse", i, ".csv", sep = ""), header = FALSE))
}
### Initialize (3002) length (16) vectors, one for each protein in the dataset:
MHs = vector(mode = "list", length = nrow(DATA))
MHs = lapply(MHs, function(z){
  return(vector(mode = "numeric", length = length(classes)))
})
### Name each entry in each MHs vector by its associated location:
for(i in 1:length(MHs)){
  names(MHs[[i]]) = classes
}
### For each protein, calculate the 16 similarity measures that go into its vector:
for(i in 1:length(MHs)){
  for(j in 1:length(classes)){
    MHs[[i]][j] = (unlist(DATA[i,-1] - standard[[j]]))%*%(C_inv[[j]])%*%(unlist(DATA[i,-1] - standard[[j]])) + log(det(C[[j]]))
  }
  print(i)
}
### Uh oh, for locations 5, 10, 11, 12, 13, and 15, even MATLAB couldn't calculate an inverse
### Pseudoinverses didn't help either, as (Matrix)*(Pseudoinverse) was not giving an identity matrix, or even close
### So for each protein, its similarity measure in location 5, 10, 11, 12, 13, and 15 is useless:
### Thus, to get around this problem, set them as the arbitrary large number 10000000:
for(i in 1:length(MHs)){
  MHs[[i]][c(5,10:13,15)] = 10000000
}
### Create length (3002) character vector, each entry corresponding to a protein
predictions = vector(mode = "character", length = length(MHs))
### Determine location prediction for each protein by the location associated with its smallest similarity measure value:
for(i in 1:length(MHs)){
  predictions[i] = names(which.min(MHs[[i]]))
}
### Save actual locations of full dataset:
actual = DATA[,1]
### Calculate error rate of classification
### (note that we know all proteins actually in locations 5, 10, 11, 12, or 13 will be predicted wrong):
error_CDA = 1 - sum(predictions == actual)/nrow(DATA)
save.image("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/CDA.RData")
### Note - will need to perform a "work-around" to ensure that the factor
### levels for the predictions are the same as those for the actual data.
### By default, will only have 10 levels, NOT 16, since 6 locations
### never get predicted for. Work-around is as follows:
### Save real predictions for first 6 proteins:
first_6 = predictions[1:6]
### Save 6 fake predictions, one for each location that never gets predicted:
fake_6 = levels(actual)[c(5,10:13,15)]
### Set predictions for first 6 proteins as the fake ones:
predictions[1:6] = fake_6
### Make predictions a factor - this will have the 16 needed levels:
predictions = as.factor(predictions)
### Put correct predictions back in for the first 6 proteins
### Number of levels will stay at 16 like we need!
### (Probably exists a better work-around: let me know!):
predictions[1:6] = first_6
### Create confusion matrix and performance statistics for each location:
confusion_CDA = confusionMatrix(data = predictions, reference = actual)
### Save CDA results to machine:
write.csv(as.data.frame.matrix(confusion_CDA$table), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/CDA_confusion.csv")
write.csv(as.data.frame.matrix(confusion_CDA$byClass), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/CDA_stats.csv")
write.table(error_CDA, file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/CDA_stats.csv", append = TRUE)
### Save R workspace:
save.image("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/CDA.RData")