'''This program takes as input an ontology and an association file, downloaded from geneontology.org and saved in txt format. It parses through the files and creates an annotation file that can be used by CRFE.''' import optparse class Ontology_Parser: def __init__(self,ontology_file,association_file,flag,out_str='',verbose=True,combine=True,save=False): self.ontology_file=ontology_file self.association_file=association_file self.flag=flag if flag=='P': self.namespace='biological_process' elif flag=='F': self.namespace='molecular_function' elif flag=='C': self.namespace='cellular_component' self.out_str=out_str self.verbose=verbose self.combine=combine self.save=save #Create all class variables self.dict_nr_to_id={} self.term_ids=[] self.term_names=[] self.parent_terms=[] self.unique_genes=[] self.ancestor_set=set() self.T_help=[[]] self.T=[set()] self.unique_genes=[] self.dict_unique_genes_to_nr=dict() def all_ancestors(self,term,internal_dummy=True): if type(term)==str: try: term=self.dict_nr_to_id[term] except: return [] if type(term)!=int: return [] else: if internal_dummy==True: self.ancestor_set=set() self.ancestor_set.add(term) for parent in set(self.parent_terms[term])-self.ancestor_set: self.all_ancestors(parent,False) def save_all(self): if self.verbose: print('Writing input for CRFE into '+'saved_data/save_'+self.out_str+'_1to0annotations_....txt'+' ...') self.savevar('T', self.T) self.savevar('term_names', self.term_names) self.savevar('unique_genes', self.unique_genes) def savevar(self, variable, v): """ to save one of three important variables: T, unqiue_genes, term_names """ try: import sys import os if sys.version_info[0] < 3: import cPickle as pickle else: import pickle except ImportError: print('Could not import the module sys, os or cPickle (Python 2.x) / pickle (Python 3)') if not os.path.exists('saved_data/'): os.makedirs('saved_data/') f=open('saved_data/save_'+self.out_str+'_1to0annotations_'+variable+'.txt','wb') pickle.dump(v, f) f.close() def run(self): if self.verbose: print('Parsing ontology file...') reader=open(self.ontology_file,'r') within_term_description=False i=0 count=-1 for row in reader: #print within_term_description,row[:-1] if row=="[Term]\n": within_term_description=True elif within_term_description and row[:7]=="id: GO:": possible_new_term_id=row[7:-1] elif within_term_description and row[:5]=="name:": possible_new_term_name= row[6:-1] elif within_term_description and row[:10]=="namespace:": if row[11:-1]!=self.namespace: within_term_description=False else: count+=1 self.term_ids.append(possible_new_term_id) self.dict_nr_to_id.update({possible_new_term_id:count}) self.term_names.append(possible_new_term_name) self.parent_terms.append([]) elif within_term_description and row[:-1]=="is_obsolete: true": within_term_description=False self.term_ids.pop() self.term_names.pop() self.dict_nr_to_id.pop(possible_new_term_id) self.parent_terms.pop() count-=1 elif within_term_description and row[:5]=="is_a:": self.parent_terms[count].append(row[9:16]) i+=1 for i in range(len(self.parent_terms)): for j in range(len(self.parent_terms[i])): self.parent_terms[i][j]=self.dict_nr_to_id[self.parent_terms[i][j]] #nr_parents=np.array([len(c) for c in parent_terms]) #ONTOLOGY IS NOW COMPLETELY LOADED AND TRANSFORMED self.T_help=[[] for term in self.term_ids] self.T=[set() for term in self.term_ids] if self.verbose: print('Parsing association file...') f=open(self.association_file,'r') reader=f.read().splitlines() f.close() i=0 count_genes=0 count_annotations=0 for row in reader: i+=1 array=row.split('\t') if row[0] not in ["!",'"']: if array[8]==self.flag: try: self.T_help[self.dict_nr_to_id[array[4][3:]]].append('a') #'a'=random value just to see whether term is in ontology count_annotations+=1 try: self.T_help[self.dict_nr_to_id[array[4][3:]]][-1]=self.dict_unique_genes_to_nr[array[2]] except KeyError: #If it is the first occurence of the gene self.T_help[self.dict_nr_to_id[array[4][3:]]][-1]=count_genes self.unique_genes.append(array[2]) self.dict_unique_genes_to_nr.update({array[2]:count_genes}) count_genes+=1 except KeyError: # if the term is not in the loaded ontology pass ancestor_terms=[] for i in range(len(self.parent_terms)): self.all_ancestors(i) ancestor_terms.append(self.ancestor_set) #ancestor_terms[i].remove(i) self.T_help=list(map(set,self.T_help)) for i in range(len(self.parent_terms)): if self.T_help[i]!=set(): for ancestor in ancestor_terms[i]: self.T[ancestor].update(self.T_help[i]) tt=[len(t) for t in self.T] #Delete all terms without annotations if self.verbose: print('Removing all functional categories without any associations...') to_be_removed=[] for i in range(len(tt)): if tt[i]==0: to_be_removed.append(i) to_be_removed.sort(reverse=True) for term in to_be_removed: self.T.pop(term) self.term_names.pop(term) self.term_ids.pop(term) self.parent_terms.pop(term) ancestor_terms.pop(term) tt=[len(t) for t in self.T] #Combine equal terms if self.combine: if self.verbose: print('Combining functional categories with exactly the same associations...') ind=sorted(range(len(tt)), key=lambda k: tt[k]) tt_ord=[tt[ind[i]] for i in range(len(tt))] res=[] for i in range(len(tt_ord)): for j in range(i+1,len(tt_ord)): if tt_ord[i]