### Load the workspace saved by earlier steps of the analysis.
### NOTE(review): nobs, ACTUAL, CDA and svms are used below but are never
### defined in this script, so they presumably come from this .RData file -
### confirm against the script that writes it.
load("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/MIXTURE.RData")
### caret supplies confusionMatrix(), used below to score the predictions:
library(caret)
### Candidate thresholds: every class size in nobs together with the value one
### below it, so the change across each increment (size - 1 -> size) is visible.
### The thresholds are kept in increasing order (any NA would go last):
nums = sort(c(nobs - 1, nobs), na.last = TRUE)
### Build one character vector of predictions per threshold z in nums.
### For each protein, look up its actual location in nobs:
###   - if that location has at most z observations, use the CDA prediction;
###   - otherwise use the SVM prediction.
### Locations at positions 5, 10, 11, 12, 13 and 15 of nobs are the six that
### CDA fails on, so they always use the SVM prediction, giving them a chance
### of good predictions.
### Every actual location must be present in names(nobs), otherwise the lookup
### below cannot work; fail early with a clear message if that is not the case:
stopifnot(all(as.character(ACTUAL) %in% names(nobs)))
predictions = lapply(nums, function(z){
  ### Positions (within nobs) of the locations CDA fails on:
  cda_fail_locations = c(5, 10, 11, 12, 13, 15)
  ### One prediction per protein; the count comes from ACTUAL rather than
  ### from a hard-coded number:
  n_proteins = length(ACTUAL)
  a = vector(mode = "character", length = n_proteins)
  for(i in seq_len(n_proteins)){
    ### Position of protein i's actual location within nobs (looked up once):
    loc = match(as.character(ACTUAL[[i]]), names(nobs))
    ### Scalar condition, so use the short-circuiting && operator:
    use_cda = (nobs[[loc]] <= z) && !is.element(loc, cda_fail_locations)
    a[i] = if(use_cda) as.character(CDA[i]) else as.character(svms[i])
  }
  return(a)
})
### Turn every prediction vector into a factor that carries exactly the levels
### of ACTUAL, including locations that never get predicted.
### factor(..., levels = ...) sets the levels directly, so there is no need to
### temporarily overwrite real predictions with fake ones, and nothing depends
### on the number of locations being 16. The levels also come out in the same
### order as in ACTUAL.
### Index of each prediction vector (also used by the steps further below):
new_nums = seq_along(predictions)
predictions = lapply(new_nums, function(z){
  labels = as.character(predictions[[z]])
  ### A predicted label that is not a level of ACTUAL would silently become NA
  ### in the factor below, so stop with an explicit message instead:
  unknown = setdiff(labels[!is.na(labels)], levels(ACTUAL))
  if(length(unknown) > 0){
    stop("Predictions contain locations that are not levels of ACTUAL: ",
         paste(unknown, collapse = ", "), call. = FALSE)
  }
  return(factor(labels, levels = levels(ACTUAL)))
})
### One confusion matrix (with its accompanying statistics) per threshold,
### comparing that threshold's predictions against the actual locations:
confusions = lapply(predictions[new_nums], function(predicted){
  confusionMatrix(data = predicted, reference = ACTUAL)
})
### Classification accuracy of every confusion matrix (the first entry of its
### $overall statistics), collected directly into a numeric vector:
accuracies = vapply(confusions[new_nums], function(cm){
  cm$overall[[1]]
}, FUN.VALUE = numeric(1))
### Label each accuracy with the threshold that produced it:
names(accuracies) = as.character(nums)
### Scatterplot of accuracy against threshold:
plot(nums, accuracies)
### Positions of the thresholds that reach the maximum accuracy.
### Exactly two are expected; anything else is still handled, but flagged:
indices = which(accuracies == max(accuracies))
if(length(indices) != 2){
  warning("Expected exactly two best thresholds but found ", length(indices),
          call. = FALSE)
}
### Best thresholds (as names) and their classification accuracies:
best = accuracies[indices]
### Average of the first two best thresholds. If only one threshold is best,
### this is that threshold itself rather than NA:
average = mean(as.numeric(names(head(best, 2))))
### Confusion matrices and stats of (up to) the first two best thresholds.
### NOTE(review): these are expected to be the same - confirm if the warning
### above is ever raised.
best_confusions = confusions[head(indices, 2)]
### Write the whole workspace back to the file it was loaded from:
save.image(file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/MIXTURE.RData")
### Export the first best confusion matrix and its per-class statistics
### (rounded to two digits) as CSV files. The temporary name lives only
### inside local(), so nothing extra is left in the workspace:
local({
  top_confusion = best_confusions[[1]]
  write.csv(
    as.data.frame.matrix(top_confusion$table),
    file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/mixture_confusion.csv"
  )
  write.csv(
    as.data.frame.matrix(round(top_confusion$byClass, digits = 2)),
    file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/mixture_stats_round.csv"
  )
})
### Matthews correlation coefficient (MCC) for every location, computed from
### the first best confusion matrix and written to a one-column CSV file
### (to be appended to the pre-existing statistics-by-class table).
### Everything happens inside local(), so no temporary objects are left behind
### and base::c is not masked by a variable of the same name.
local({
  ### Confusion matrix with rows as predictions and columns as actuals:
  cm = as.matrix(as.data.frame.matrix(best_confusions[[1]]$table))
  ### Work in doubles so the products below cannot overflow integer range:
  storage.mode(cm) = "double"
  ### Per-location counts, for all locations at once:
  tp = diag(cm)                  ### true positives
  fp = rowSums(cm) - tp          ### false positives
  fn = colSums(cm) - tp          ### false negatives
  tn = sum(cm) - tp - fp - fn    ### true negatives
  denominator = sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
  ### When the denominator is zero, one of its four factors is zero, which
  ### forces the numerator to zero as well, so the MCC is set to zero there:
  mcc = ifelse(denominator == 0, 0, ((tp * tn) - (fp * fn)) / denominator)
  ### Rounded column vector with the appropriate column name:
  MCC = round(matrix(mcc, ncol = 1, dimnames = list(NULL, "MCC")), digits = 2)
  ### Save results to machine:
  write.table(MCC, file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/mixture_MCCs.csv", sep = ",", row.names = FALSE, col.names = TRUE)
})