library(utils) ## txtProgressBar uniprot_mapping <- function(ids) { uri <- 'http://www.uniprot.org/uniprot/?query=' idStr <- paste(ids, collapse="+or+") format <- '&format=tab' fullUri <- paste0(uri,idStr,format) dat <- read.delim(fullUri) dat } ## Usage ids = read.table('genes.big.list')$V1 lenids = length(ids) pieces = 100 pb <- txtProgressBar(min = 0, max=pieces, style = 3) res <- lapply(1:pieces, function(i){ len = floor(lenids / pieces) sta = len * (i-1) end = len * i if (end > lenids){ end = lenids } setTxtProgressBar(pb, i) result = uniprot_mapping(ids[sta:end]) return(c(sta, end, result)) }) library(tibble) tibs <- lapply(1:pieces, function(i){ as.tibble(data.frame(res[[i]]))[,c("Entry", "Entry.name", "Status", "Protein.names", "Gene.names", "Organism", "Length")] }) genes.table <- do.call(rbind, tibs) write.table(genes.table, file="genes.big.table") ## We assume the gene names contain the all we need genes.table <- read.table("genes.table")$V1 gnames <- as.tibble(genes.table$Gene.names) ## and now we compare with the IDS and see whether we ## can match ATG name to a row in gnames. gvals <- unlist(gnames$value) ## flatten names into a single string df <- c() pb <- txtProgressBar(min = 0, max=length(ids), style = 3) for (i in 1:length(ids)){ gid <- ids[[i]] matched <- gvals[grepl(gid, gvals, ignore.case = T)] setTxtProgressBar(pb, i) if (length(matched) == 1){ df <- rbind(df, c(gid, matched)) } else if (length(matched) > 1) { df <- rbind(df, c(gid, paste(matched, collapse = " "))) } else { df <- rbind(df, c(gid, "")) } } dfr <- data.frame(old=df[,1], new=df[,2]) write.table(dfr, file="remapping.big.tsv", quote=TRUE, sep="\t", row.names = FALSE) ## Remove redundant names genes.table <- as_tibble(read.table(file="remapping.tsv", sep="\t", header=TRUE, fill = TRUE, quote = "")) genes.table <- as_tibble(dfr) genes.table$easy = 0 pb <- txtProgressBar(min = 0, max=length(genes.table$old), style = 3) for (i in 1:length(genes.table$old)){ gid <- genes.table[i,]$old strs <- genes.table[i,]$new ## remove annotations between '' quotes ## new.strs <- gsub("(''[^']*'')", "", strs) ## remove unique gene entries new.strs <- trimws(gsub("[\\s/]+", " ", gsub(paste0("\\s*(AXX...)*", gid, "\\s*"), " ", strs, ignore.case = T))) setTxtProgressBar(pb, i) if (length(new.strs) == 0){ new.strs = gid genes.table[i,]$easy = 2 } else { new.strs = unique(unlist(strsplit( gsub("[._;-]", "-", toupper(new.strs)), split="\\s+"))) ## filter gene names, containing at least 1 number new.strs = grep("[A-Za-z]+([0-9]+[A-Za-z-]*)+", new.strs, value = T) ## if (length(new.strs) == 1){genes.table[i,]$easy = 1} ## merge to one line new.strs = paste(new.strs, collapse = " ") } new.strs <- gsub("\\s+AXX17-\\s+", " ", new.strs) genes.table[i,]$new <- new.strs } write.table(genes.table, file="remapping.complete.tsv", quote=FALSE, sep="\t", row.names = FALSE)