## Scripts for updating specData
## Download and unpack mapping file:
## ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
## wget and tar -xzf

## Generates specData
##
## Reads 'names.dmp' from 'filesDir', keeps the rows whose name class
## matches 'scientific name' or 'synonym', splits every kept name at its
## first space into a genus part and a remainder, and saves the result as
## an object called 'specData'.
##
## Arguments:
##   filesDir  directory that contains the unpacked 'names.dmp'.
##   outFile   path of the .rda file to write. The default writes
##             'specData.rda' into the current working directory.
##
## The saved 'specData' data.frame has three columns:
##   tax_id   integer
##   genus    factor; the text before the first space of the name
##   species  character; the text after the first space, or NA when the
##            name has no second word
.processTaxNamesFile <- function(filesDir = getwd(),
                                 outFile = "specData.rda") {
    dest <- file.path(filesDir, "names.dmp")
    if (!file.exists(dest)) {
        stop("cannot find 'names.dmp' in ", filesDir, call. = FALSE)
    }

    ## The file is split on tabs, so every second column holds nothing but
    ## the '|' delimiter. Because sep is the tab and quoting is disabled,
    ## no field can contain a tab and no further tab stripping is needed.
    data <- read.delim(dest, header = FALSE, sep = "\t", quote = "",
                       stringsAsFactors = FALSE)
    if (ncol(data) < 7L) {
        stop("unexpected layout in ", dest, ": found ", ncol(data),
             " tab separated columns, need at least 7", call. = FALSE)
    }

    ## Odd raw columns are tax_id, name_txt, unique_name, name_class;
    ## only tax_id (1), name_txt (3) and name_class (7) are used.
    species <- data[, c(1L, 3L, 7L)]
    colnames(species) <- c("tax_id", "name_txt", "name_class")

    ## Keep rows whose name class matches 'scientific name' or 'synonym'
    ## (substring match, so any class containing 'synonym' is kept too).
    keep <- grepl("scientific name", species[["name_class"]]) |
        grepl("synonym", species[["name_class"]])
    species <- species[keep, c("tax_id", "name_txt")]

    ## Split the name on spaces: first token is the genus, the remaining
    ## tokens are glued back together with single spaces.
    spltSpec <- strsplit(as.character(species[["name_txt"]]), split = " ")
    genusDat <- vapply(spltSpec, function(x) x[1], character(1))
    ## NA_character_ (not logical NA) keeps the column character even when
    ## none of the names has a second word.
    speciesDat <- vapply(spltSpec, function(x) {
        if (length(x) > 1) {
            paste(x[-1], collapse = " ")
        } else {
            NA_character_
        }
    }, character(1))

    specData <- data.frame(tax_id = as.integer(species[["tax_id"]]),
                           genus = as.factor(genusDat),
                           species = speciesDat,
                           stringsAsFactors = FALSE)
    ## The object must be saved under the name 'specData'.
    save(specData, file = outFile, compress = "xz")
}