## # Hacked together, sorry.. # Required CSVs were tediously exported manually from GEO web interface. ## import csv import json import requests import pdb import pprint import GEOparse import xmltodict import logging GEOparse.logger.setLevel(logging.getLevelName("WARN")) AE_URL = "https://www.ebi.ac.uk/arrayexpress/ArrayExpress-Experiments.txt?array=" GEO_URL = "https://www.ncbi.nlm.nih.gov/gds?term=XXX[Related%20Series]&cmd=DetailsSearch&format=TEXT" SRA_URL = "https://www.ncbi.nlm.nih.gov/sra?term=%22instrument%20illumina%20genome%20analyzer%22%5BProperties%5D" MINIML_URL = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=XXX&targ=self&form=xml&view=quick" all_accessions = {} # First do microarray with open('config/supported_microarray_platforms.csv', 'r') as csv_in: csv_in = csv.reader(csv_in) for ccount, row in enumerate(csv_in): try: if ccount is 0: continue accession = row[1] # This is Array Express. if '-' in accession: url = AE_URL + accession tsv_text = requests.get(url, timeout=60).text.encode("utf-8").splitlines() tsv_in = csv.reader(tsv_text, delimiter='\t') for count, tsv_row in enumerate(tsv_in): if count is 0: continue ex_accesion = tsv_row[0] num_assays = tsv_row[4] all_accessions[ex_accesion] = int(num_assays) else: url = GEO_URL.replace('XXX', accession) geo_text = requests.get(url, timeout=60).text.encode("utf-8") geo_text = geo_text.split('
\n')[1] sections = geo_text.split('\n\n') for section in sections: num_samples = 0 ex_accession = None for lcount, line in enumerate(section.split('\n')): if lcount == 4: num_samples = int(line.split(' ')[-2]) if lcount == 6: ex_accession = line.split('Accession: ')[1].split('\t')[0] all_accessions[ex_accession] = num_samples except Exception as e: # These are always empty search results print(url) print(e) continue # Then, do RNASeq with open('config/supported_rnaseq_platforms.txt', 'r') as txt_file: lines = txt_file.readlines() for line in lines: with open('accessions/' + line.strip() + '.csv', 'r') as accession_file: csv_in = csv.reader(accession_file) for ccount, row in enumerate(csv_in): try: if ccount is 0: continue srp = row[5] current = all_accessions.get(srp, 0) all_accessions[srp] = current + 1 except Exception as e: print(e) # Prefer GEO over AE all_keys = all_accessions.keys() for key in all_keys: if "E-GEOD-" in key: gse = key.replace('E-GEOD-', 'GSE') if gse in all_keys: print("Preferring " + gse + " over " + key) del all_accessions[key] # Load our GEO/GPL map unique = [] with open('config/geo_gpl_map.json') as json_data: geo_map = json.load(json_data) for name, gpls in geo_map.items(): try: for gpl in gpls: g = GEOparse.get_GEO(gpl, destdir='/tmp') serieses = g.metadata['series_id'] for series in serieses: murl = MINIML_URL.replace('XXX', series) miniml_text = requests.get(murl, timeout=60).text.encode("utf-8") if '