### Warning: the reader may find some varible names strange

### First - install packages with install.packages()
### Note - installing BioSeqClass may be more difficult than just using install.packages("BioSeqClass")
### See http://www.bioconductor.org/packages/release/bioc/html/BioSeqClass.html for potential help
### Load packages - most of the following are used at some point:
library(seqinr)
library(protr)
library(party)
library(BioSeqClass)
library(randomForest)
library(lme4)
library(caret)
library(adabag)
library(tree)
library(rpart)
library(DMwR)
### Read in protein data, already mostly subsetted properly using online database
### (readFASTA may not work; may have to use read.fasta (they're from different packages)):
#stuff = readFASTA("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/benchmark.fasta", seqonly = FALSE)
stuff = read.fasta("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/benchmark.fasta", seqtype = "AA", as.string = TRUE)
vector = as.vector(stuff)
### Remove any proteins containing an X (unknown amino acid) in their sequences:
tacos = NULL
for(i in 1:length(vector)){
  if(grepl("X", vector[i]) == TRUE){
    tacos = c(tacos, i)
  }
}
vector = vector[-tacos]
### Shorten protein names to just their length 6 identifiers:
names(vector) = sapply(names(vector), function(x){
  substring(x, 4, 9)
})
### Set a seed for reproducability (since random sampling is coming up):
set.seed(1349)
### Read in output text file from "50/50" BLASTClust, where the one or more proteins on each line represent a specific cluster:
scan = scan("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/blasted.txt", what = character(), sep = "\n")
### Create an R list, each list element a cluster:
list = list()
for(i in 1:length(scan)){
  list[[i]] = unlist(strsplit(scan[i], " "))
}
### Shorten protein names to just their length 6 identifiers:
for(i in 1:length(list)){
  for(j in 1:length(list[[i]])){
    list[[i]][j] = substring(list[[i]][j], 4, 9)
  }
}
### For each cluster in the list, randomly select just one protein from the cluster:
proteins = NULL
for(i in 1:length(list)){
  proteins[i] = list[[i]][sample(1:length(list[[i]]), 1)]
}
### Remove proteins that weren't previously selected:
blasted_outta_here = NULL
for(i in 1:length(vector)){
  if((names(vector)[i] %in% proteins) == FALSE){
    blasted_outta_here = c(blasted_outta_here, i)
  }
}
vector = vector[-blasted_outta_here]
### Read in list of known protein locations:
locations = read.csv("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/locations_unique.csv", header = FALSE, col.names = c("Protein", "Location"))
### Transform protein sequences by means of Pseudo Amino Acid Composition:
PAA = featurePseudoAAComp(vector, d = 15, w = 0.05)
### Save results to machine:
write.csv(PAA, "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/50_50.csv")
### Read results back in:
PAA = read.csv("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/50_50.csv", header = TRUE, row.names = 1)
### Reorder data alphabetically by protein identifier:
PAA = PAA[order(row.names(PAA)),]
### Remove location listings for proteins that aren't now in our dataset:
subcellular = rep("word", nrow(PAA))
a = NULL
for (i in 1:nrow(locations)){
  if (locations[i,1] %in% rownames(PAA) == FALSE){
    a = c(a,i)
  }
}
locations = locations[-a,]
### Combine known locations and the PAA values into a data frame:
subcellular = as.factor(as.character(locations$Location))
PAA = data.frame(subcellular, PAA)
### Save data frame to machine:
write.csv(PAA, "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/50_50.csv")
### End of preprocessing
###############################################################################################
### Beginning of analysis
### "Refresh" seed - not really necessary:
set.seed(0)
### Read data back in:
DATA = read.csv("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/50_50.csv", header = TRUE, row.names = 1)
### Set a seed for reproducability (since random sampling is coming up):
set.seed(69)
### Take a sample of 70% of the numbers in 1 through the number of proteins in the dataset:
nums = sample(1:nrow(DATA), floor(.7*nrow(DATA)))
### Create 70% training set (and ensure that at least one protein from each location is in it
### (need to do this so models can know that each location is a "possibility"):
train = DATA[nums,]
while (0 %in% summary(train$subcellular) == TRUE){
  nums = sample(1:nrow(DATA), floor(.7*nrow(DATA)))
  train = DATA[nums,]
}
### Create 30% testing set:
test = DATA[-nums,]
### Random Forests
### Create Random Forest model with training data - 500 trees is default:
RF = randomForest(formula = subcellular ~ ., data = train, importance = TRUE, proximity = TRUE)
### Variable importance plot:
varImp_RF = varImpPlot(RF)
### Obtain predictions for testing data:
predicted_RF = predict(RF, newdata = test)
### Save actual locations of testing data:
actual = test[,1]
### Calculate error rate of classification:
error_RF = 1 - sum(predicted_RF == actual)/nrow(test)
### Create confusion matrix and performance statistics for each location:
confusion_RF = confusionMatrix(data = predicted_RF, reference = actual)
### Adaptive Boosting
### Set some "appropriate" parameter options for the algorithm:
cntrl = rpart.control(maxdepth = 6, minsplit = 0, cp = -1)
### Create adaptive boosting model with training data, using 500 trees:
ada = boosting(formula = subcellular ~ ., data = train, mfinal = 500, coeflearn = "Freund", boos = TRUE, control = cntrl)
### Variable importance plot:
varImp_ada = barplot(ada$importance[order(ada$importance, decreasing = TRUE)], ylim = c(0,20), main = "Variables Relative Importance", col = "lightblue")
### Obtain predictions for testing data:
predicted_ada = predict(object = ada, newdata = test, newmfinal = 500)
### Calculate error rate of classification:
error_ada = 1 - sum(predicted_ada$class == actual)/nrow(test)
### Create confusion matrix and performance statistics for each location:
predicted_ada_class = as.factor(predicted_ada$class)
levels(predicted_ada_class) = c(levels(predicted_ada_class), levels(actual))
predicted_ada_class = factor(predicted_ada_class, levels(predicted_ada_class)[order(levels(predicted_ada_class))])
confusion_ada = confusionMatrix(data = predicted_ada_class, reference = actual)
### Adaptive Boosting with ten-fold cross-validation
### Create adaptive boosting with ten-fold cross-validation model with full dataset, using (50 trees in each "fold")*(10 "folds") = 500 total trees:
ada_cv = boosting.cv(formula = subcellular ~ ., data = DATA, v = 10, boos = TRUE, mfinal = 50, coeflearn = "Freund", control = rpart.control(maxdepth = 10))
### Variable importance plot (gives an error):
#varImp_ada_cv = barplot(ada_cv$importance[order(ada_cv$importance, decreasing = TRUE)], ylim = c(0,20), main = "Variables Relative Importance", col = "lightblue")
### Save actual locations of full dataset:
ACTUAL = DATA[,1]
### Calculate error rate of classification:
error_ada_cv = 1 - sum(ada_cv$class == ACTUAL)/nrow(DATA)
### Create confusion matrix and performance statistics for each location:
ada_cv_class = as.factor(ada_cv$class)
levels(ada_cv_class) = c(levels(ada_cv_class), levels(actual))
ada_cv_class = factor(ada_cv_class, levels(ada_cv_class)[order(levels(ada_cv_class))])
confusion_ada_cv = confusionMatrix(data = ada_cv_class, reference = ACTUAL)
### SAMME
### Create SAMME model with training data, using 500 trees
samme = boosting(formula = subcellular ~ ., data = train, mfinal = 500, coeflearn = "Zhu", boos = TRUE, control = cntrl)
### Variable importance plot:
varImp_samme = varImp_samme = barplot(samme$importance[order(samme$importance, decreasing = TRUE)], ylim = c(0,20), main = "Variables Relative Importance", col = "lightblue")
### Obtain predictions for testing data:
predicted_samme = predict(object = samme, newdata = test, newmfinal = 500)
### Calculate error rate of classification:
error_samme = 1 - sum(predicted_samme$class == actual)/nrow(test)
### Create confusion matrix and performance statistics for each location:
predicted_samme_class = as.factor(predicted_samme$class)
levels(predicted_samme_class) = c(levels(predicted_samme_class), levels(actual))
predicted_samme_class = factor(predicted_samme_class, levels(predicted_samme_class)[order(levels(predicted_samme_class))])
confusion_samme = confusionMatrix(data = predicted_samme_class, reference = actual)
### Save Random Forests results to machine:
write.csv(as.data.frame.matrix(confusion_RF$table), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/RF_confusion.csv")
write.csv(as.data.frame.matrix(confusion_RF$byClass), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/RF_stats.csv")
write.table(error_RF, file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/RF_stats.csv", append = TRUE)
### Save Adaptive Boosting results to machine:
write.csv(as.data.frame.matrix(confusion_ada$table), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/ada_confusion.csv")
write.csv(as.data.frame.matrix(confusion_ada$byClass), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/ada_stats.csv")
write.table(error_ada, file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/ada_stats.csv", append = TRUE)
### Save Adaptive Boosting with ten-fold cross-validation results to machine:
write.csv(as.data.frame.matrix(confusion_ada_cv$table), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/ada_cv_confusion.csv")
write.csv(as.data.frame.matrix(confusion_ada_cv$byClass), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/ada_cv_stats.csv")
write.table(error_ada_cv, file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/ada_cv_stats.csv", append = TRUE)
### Save SAMME results to machine:
write.csv(as.data.frame.matrix(confusion_samme$table), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/samme_confusion.csv")
write.csv(as.data.frame.matrix(confusion_samme$byClass), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/samme_stats.csv")
write.table(error_samme, file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/samme_stats.csv", append = TRUE)
### And now a few more methods
### Bagging (Bootstrap Aggregating)
### Create bagging model with training data, using 500 trees:
bagging = bagging(formula = subcellular ~ ., data = train, mfinal = 500, control = cntrl)
### Variable importance plot:
varImp_bagging = barplot(bagging$importance[order(bagging$importance, decreasing = TRUE)], ylim = c(0,20), main = "Variables Relative Importance", col = "lightblue")
### Obtain predictions for testing data:
predicted_bagging = predict(object = bagging, newdata = test, newmfinal = 500)
### Calculate error rate of classification:
error_bagging = 1 - sum(predicted_bagging$class == actual)/nrow(test)
### Create confusion matrix and performance statistics for each location:
predicted_bagging_class = as.factor(predicted_bagging$class)
levels(predicted_bagging_class) = c(levels(predicted_bagging_class), levels(actual))
predicted_bagging_class = factor(predicted_bagging_class, levels(predicted_bagging_class)[order(levels(predicted_bagging_class))])
confusion_bagging = confusionMatrix(data = predicted_bagging_class, reference = actual)
### Bagging with ten-fold cross-validation
### Create bagging with ten-fold cross-validation model using full dataset, using (50 trees in each "fold")*(10 "folds") = 500 total trees:
bagging_cv = bagging.cv(formula = subcellular ~ ., data = DATA, v = 10, mfinal = 50, control = rpart.control(maxdepth = 10))
### Variable importance plot (gives an error):
#varImp_bagging_cv = barplot(bagging_cv$importance[order(bagging_cv$importance, decreasing = TRUE)], ylim = c(0,20), main = "Variables Relative Importance", col = "lightblue")
### Calculate error rate of classification:
error_bagging_cv = 1 - sum(bagging_cv$class == ACTUAL)/nrow(DATA)
### Create confusion matrix and performance statistics for each location:
bagging_cv_class = as.factor(bagging_cv$class)
levels(bagging_cv_class) = c(levels(bagging_cv_class), levels(actual))
bagging_cv_class = factor(bagging_cv_class, levels(bagging_cv_class)[order(levels(bagging_cv_class))])
confusion_bagging_cv = confusionMatrix(data = bagging_cv_class, reference = ACTUAL)
### Save Bagging results to machine
write.csv(as.data.frame.matrix(confusion_bagging$table), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/bagging_confusion.csv")
write.csv(as.data.frame.matrix(confusion_bagging$byClass), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/bagging_stats.csv")
write.table(error_bagging, file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/bagging_stats.csv", append = TRUE)
### Save Bagging with ten-fold cross-validation results to machine
write.csv(as.data.frame.matrix(confusion_bagging_cv$table), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/bagging_cv_confusion.csv")
write.csv(as.data.frame.matrix(confusion_bagging_cv$byClass), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/bagging_cv_stats.csv")
write.table(error_bagging_cv, file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/bagging_cv_stats.csv", append = TRUE)
### Save R workspace
save.image("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/50_50.RData")