### Warning: the reader may find some varible names strange ### First - install packages with install.packages() ### Note - installing BioSeqClass may be more difficult than just using install.packages("BioSeqClass") ### See http://www.bioconductor.org/packages/release/bioc/html/BioSeqClass.html for potential help ### Load packages - most of the following are used at some point: library(seqinr) library(protr) library(party) library(BioSeqClass) library(randomForest) library(lme4) library(caret) library(adabag) library(tree) library(rpart) library(DMwR) ### Read in protein data, already mostly subsetted properly using online database ### (readFASTA may not work; may have to use read.fasta (they're from different packages)): #stuff = readFASTA("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/benchmark.fasta", seqonly = FALSE) stuff = read.fasta("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/benchmark.fasta", seqtype = "AA", as.string = TRUE) vector = as.vector(stuff) ### Remove any proteins containing an X (unknown amino acid) in their sequences: tacos = NULL for(i in 1:length(vector)){ if(grepl("X", vector[i]) == TRUE){ tacos = c(tacos, i) } } vector = vector[-tacos] ### Shorten protein names to just their length 6 identifiers: names(vector) = sapply(names(vector), function(x){ substring(x, 4, 9) }) ### Set a seed for reproducability (since random sampling is coming up): set.seed(1349) ### Read in output text file from "50/50" BLASTClust, where the one or more proteins on each line represent a specific cluster: scan = scan("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/blasted.txt", what = character(), sep = "\n") ### Create an R list, each list element a cluster: list = list() for(i in 1:length(scan)){ list[[i]] = unlist(strsplit(scan[i], " ")) } ### Shorten protein names to just their length 6 identifiers: for(i in 1:length(list)){ for(j in 1:length(list[[i]])){ list[[i]][j] = substring(list[[i]][j], 4, 9) } } ### For each cluster in the list, randomly select just one protein from the cluster: proteins = NULL for(i in 1:length(list)){ proteins[i] = list[[i]][sample(1:length(list[[i]]), 1)] } ### Remove proteins that weren't previously selected: blasted_outta_here = NULL for(i in 1:length(vector)){ if((names(vector)[i] %in% proteins) == FALSE){ blasted_outta_here = c(blasted_outta_here, i) } } vector = vector[-blasted_outta_here] ### Read in list of known protein locations: locations = read.csv("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/locations_unique.csv", header = FALSE, col.names = c("Protein", "Location")) ### Transform protein sequences by means of Pseudo Amino Acid Composition: PAA = featurePseudoAAComp(vector, d = 15, w = 0.05) ### Save results to machine: write.csv(PAA, "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/50_50.csv") ### Read results back in: PAA = read.csv("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/50_50.csv", header = TRUE, row.names = 1) ### Reorder data alphabetically by protein identifier: PAA = PAA[order(row.names(PAA)),] ### Remove location listings for proteins that aren't now in our dataset: subcellular = rep("word", nrow(PAA)) a = NULL for (i in 1:nrow(locations)){ if (locations[i,1] %in% rownames(PAA) == FALSE){ a = c(a,i) } } locations = locations[-a,] ### Combine known locations and the PAA values into a data frame: subcellular = as.factor(as.character(locations$Location)) PAA = data.frame(subcellular, PAA) ### Save data frame to machine: write.csv(PAA, "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/50_50.csv") ### End of preprocessing ############################################################################################### ### Beginning of analysis ### "Refresh" seed - not really necessary: set.seed(0) ### Read data back in: DATA = read.csv("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/50_50.csv", header = TRUE, row.names = 1) ### Set a seed for reproducability (since random sampling is coming up): set.seed(69) ### Take a sample of 70% of the numbers in 1 through the number of proteins in the dataset: nums = sample(1:nrow(DATA), floor(.7*nrow(DATA))) ### Create 70% training set (and ensure that at least one protein from each location is in it ### (need to do this so models can know that each location is a "possibility"): train = DATA[nums,] while (0 %in% summary(train$subcellular) == TRUE){ nums = sample(1:nrow(DATA), floor(.7*nrow(DATA))) train = DATA[nums,] } ### Create 30% testing set: test = DATA[-nums,] ### Random Forests ### Create Random Forest model with training data - 500 trees is default: RF = randomForest(formula = subcellular ~ ., data = train, importance = TRUE, proximity = TRUE) ### Variable importance plot: varImp_RF = varImpPlot(RF) ### Obtain predictions for testing data: predicted_RF = predict(RF, newdata = test) ### Save actual locations of testing data: actual = test[,1] ### Calculate error rate of classification: error_RF = 1 - sum(predicted_RF == actual)/nrow(test) ### Create confusion matrix and performance statistics for each location: confusion_RF = confusionMatrix(data = predicted_RF, reference = actual) ### Adaptive Boosting ### Set some "appropriate" parameter options for the algorithm: cntrl = rpart.control(maxdepth = 6, minsplit = 0, cp = -1) ### Create adaptive boosting model with training data, using 500 trees: ada = boosting(formula = subcellular ~ ., data = train, mfinal = 500, coeflearn = "Freund", boos = TRUE, control = cntrl) ### Variable importance plot: varImp_ada = barplot(ada$importance[order(ada$importance, decreasing = TRUE)], ylim = c(0,20), main = "Variables Relative Importance", col = "lightblue") ### Obtain predictions for testing data: predicted_ada = predict(object = ada, newdata = test, newmfinal = 500) ### Calculate error rate of classification: error_ada = 1 - sum(predicted_ada$class == actual)/nrow(test) ### Create confusion matrix and performance statistics for each location: predicted_ada_class = as.factor(predicted_ada$class) levels(predicted_ada_class) = c(levels(predicted_ada_class), levels(actual)) predicted_ada_class = factor(predicted_ada_class, levels(predicted_ada_class)[order(levels(predicted_ada_class))]) confusion_ada = confusionMatrix(data = predicted_ada_class, reference = actual) ### Adaptive Boosting with ten-fold cross-validation ### Create adaptive boosting with ten-fold cross-validation model with full dataset, using (50 trees in each "fold")*(10 "folds") = 500 total trees: ada_cv = boosting.cv(formula = subcellular ~ ., data = DATA, v = 10, boos = TRUE, mfinal = 50, coeflearn = "Freund", control = rpart.control(maxdepth = 10)) ### Variable importance plot (gives an error): #varImp_ada_cv = barplot(ada_cv$importance[order(ada_cv$importance, decreasing = TRUE)], ylim = c(0,20), main = "Variables Relative Importance", col = "lightblue") ### Save actual locations of full dataset: ACTUAL = DATA[,1] ### Calculate error rate of classification: error_ada_cv = 1 - sum(ada_cv$class == ACTUAL)/nrow(DATA) ### Create confusion matrix and performance statistics for each location: ada_cv_class = as.factor(ada_cv$class) levels(ada_cv_class) = c(levels(ada_cv_class), levels(actual)) ada_cv_class = factor(ada_cv_class, levels(ada_cv_class)[order(levels(ada_cv_class))]) confusion_ada_cv = confusionMatrix(data = ada_cv_class, reference = ACTUAL) ### SAMME ### Create SAMME model with training data, using 500 trees samme = boosting(formula = subcellular ~ ., data = train, mfinal = 500, coeflearn = "Zhu", boos = TRUE, control = cntrl) ### Variable importance plot: varImp_samme = varImp_samme = barplot(samme$importance[order(samme$importance, decreasing = TRUE)], ylim = c(0,20), main = "Variables Relative Importance", col = "lightblue") ### Obtain predictions for testing data: predicted_samme = predict(object = samme, newdata = test, newmfinal = 500) ### Calculate error rate of classification: error_samme = 1 - sum(predicted_samme$class == actual)/nrow(test) ### Create confusion matrix and performance statistics for each location: predicted_samme_class = as.factor(predicted_samme$class) levels(predicted_samme_class) = c(levels(predicted_samme_class), levels(actual)) predicted_samme_class = factor(predicted_samme_class, levels(predicted_samme_class)[order(levels(predicted_samme_class))]) confusion_samme = confusionMatrix(data = predicted_samme_class, reference = actual) ### Save Random Forests results to machine: write.csv(as.data.frame.matrix(confusion_RF$table), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/RF_confusion.csv") write.csv(as.data.frame.matrix(confusion_RF$byClass), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/RF_stats.csv") write.table(error_RF, file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/RF_stats.csv", append = TRUE) ### Save Adaptive Boosting results to machine: write.csv(as.data.frame.matrix(confusion_ada$table), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/ada_confusion.csv") write.csv(as.data.frame.matrix(confusion_ada$byClass), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/ada_stats.csv") write.table(error_ada, file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/ada_stats.csv", append = TRUE) ### Save Adaptive Boosting with ten-fold cross-validation results to machine: write.csv(as.data.frame.matrix(confusion_ada_cv$table), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/ada_cv_confusion.csv") write.csv(as.data.frame.matrix(confusion_ada_cv$byClass), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/ada_cv_stats.csv") write.table(error_ada_cv, file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/ada_cv_stats.csv", append = TRUE) ### Save SAMME results to machine: write.csv(as.data.frame.matrix(confusion_samme$table), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/samme_confusion.csv") write.csv(as.data.frame.matrix(confusion_samme$byClass), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/samme_stats.csv") write.table(error_samme, file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/samme_stats.csv", append = TRUE) ### And now a few more methods ### Bagging (Bootstrap Aggregating) ### Create bagging model with training data, using 500 trees: bagging = bagging(formula = subcellular ~ ., data = train, mfinal = 500, control = cntrl) ### Variable importance plot: varImp_bagging = barplot(bagging$importance[order(bagging$importance, decreasing = TRUE)], ylim = c(0,20), main = "Variables Relative Importance", col = "lightblue") ### Obtain predictions for testing data: predicted_bagging = predict(object = bagging, newdata = test, newmfinal = 500) ### Calculate error rate of classification: error_bagging = 1 - sum(predicted_bagging$class == actual)/nrow(test) ### Create confusion matrix and performance statistics for each location: predicted_bagging_class = as.factor(predicted_bagging$class) levels(predicted_bagging_class) = c(levels(predicted_bagging_class), levels(actual)) predicted_bagging_class = factor(predicted_bagging_class, levels(predicted_bagging_class)[order(levels(predicted_bagging_class))]) confusion_bagging = confusionMatrix(data = predicted_bagging_class, reference = actual) ### Bagging with ten-fold cross-validation ### Create bagging with ten-fold cross-validation model using full dataset, using (50 trees in each "fold")*(10 "folds") = 500 total trees: bagging_cv = bagging.cv(formula = subcellular ~ ., data = DATA, v = 10, mfinal = 50, control = rpart.control(maxdepth = 10)) ### Variable importance plot (gives an error): #varImp_bagging_cv = barplot(bagging_cv$importance[order(bagging_cv$importance, decreasing = TRUE)], ylim = c(0,20), main = "Variables Relative Importance", col = "lightblue") ### Calculate error rate of classification: error_bagging_cv = 1 - sum(bagging_cv$class == ACTUAL)/nrow(DATA) ### Create confusion matrix and performance statistics for each location: bagging_cv_class = as.factor(bagging_cv$class) levels(bagging_cv_class) = c(levels(bagging_cv_class), levels(actual)) bagging_cv_class = factor(bagging_cv_class, levels(bagging_cv_class)[order(levels(bagging_cv_class))]) confusion_bagging_cv = confusionMatrix(data = bagging_cv_class, reference = ACTUAL) ### Save Bagging results to machine write.csv(as.data.frame.matrix(confusion_bagging$table), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/bagging_confusion.csv") write.csv(as.data.frame.matrix(confusion_bagging$byClass), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/bagging_stats.csv") write.table(error_bagging, file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/bagging_stats.csv", append = TRUE) ### Save Bagging with ten-fold cross-validation results to machine write.csv(as.data.frame.matrix(confusion_bagging_cv$table), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/bagging_cv_confusion.csv") write.csv(as.data.frame.matrix(confusion_bagging_cv$byClass), file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/bagging_cv_stats.csv") write.table(error_bagging_cv, file = "C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/bagging_cv_stats.csv", append = TRUE) ### Save R workspace save.image("C:/Users/jdmunyon/Desktop/Dropbox/Senior_Project/data_and_code/results/50_50/50_50.RData")