'''This program implements the functional enrichment method CRFE, described in
C. Kadelka, M. Brandon, T. M. Murali, Concise Functional Enrichment of Ranked Gene Lists.
See www.math.vt.edu/people/claus89/crfe/ for help.'''

import sys
import csv, math
if sys.version_info[0] < 3:
    import cPickle as pickle
else:
    import pickle
import random
import optparse
import datetime
import os
import time

def hist(lst,steps=10,mi=148.8,ma=148.8):
    if mi==148.8:
        mi=min(lst)
    if ma==148.8:
        ma=max(lst)
    h=(ma-mi)*1./steps
    #print steps,h, ma, mi
    c=[0 for i in range(len(lst))]
    lim=[0 for i in range(steps)]
    for i in range(steps):
        lim[i]=float(mi+i*h)
        for j in range(len(lst)):
            if lst[j]>=mi+i*h:
                c[j]+=1
    d=[]
    for i in range(steps):
        d.append(c.count(i+1))
    return (d,lim)

def mean(vec):
    return sum(vec)*1./len(vec)

def std(vec):
    mean_vec=mean(vec)
    return math.sqrt(sum([(el-mean_vec)**2 for el in vec])*1./len(vec))

def search(term_names,cc,word):
    print('ID in lst\tnr genes\tname')
    res=[]
    for item,i in zip(term_names,range(len(term_names))):
        if word in item:
            print(i,'\t\t',cc[i],'\t\t',item)
            res.append(i)
    return res

def discretize(vector,value):
    diff=[math.fabs(v-value) for v in vector]
    ind=sorted(range(len(vector)), reverse=False, key=lambda k: diff[k])
    return (vector[ind[0]],ind[0])
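# Hedged usage note (illustration only, not part of the original pipeline):
# discretize() snaps a continuous value onto the closest entry of a discrete
# grid and also reports the index of that entry, e.g.
#   discretize([0.05, 0.10, 0.15], 0.12)  ->  (0.10, 1)
# CRFE.initialize() below uses this to place the initial alpha and beta on the
# allowed parameter grid self.ALPHA_BETA.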
class CRFE:
    ##############################
    ###  Initializing Methods  ###
    ##############################
    def __init__(self, repeats, nr_categories, lower_cutoff, upper_cutoff, belief, threshold, threshold_type,
                 burnin_steps, MCMC_steps, alpha_beta_max, proportion_parameter_change, A, P,
                 gene_file, gene_file_use, annotations_file, output_folder, out_str='', seed=-1):
        self.repeats = repeats
        self.nr_categories = nr_categories
        self.lower_cutoff = lower_cutoff
        self.upper_cutoff = upper_cutoff
        self.belief = belief
        self.gene_file_use = gene_file_use #how to use the expression value list; 0 (normal), 1 (inverted), 2 (absolute value)
        if gene_file_use==1:
            self.threshold=-1*threshold
        else:
            self.threshold=threshold
        self.threshold_type=threshold_type
        self.burnin_steps = burnin_steps
        self.MCMC_steps = MCMC_steps
        self.alpha_beta_max = alpha_beta_max
        self.proportion_parameter_change=proportion_parameter_change
        self.A = A
        self.P = P
        self.gene_file = gene_file
        self.annotations_file = annotations_file
        self.out_str = out_str
        self.output_folder = output_folder
        #Make everything class variables
        self.G=[]
        self.T=[]
        self.unique_genes=[]
        self.term_names=[]
        self.seed=seed
        self.show_genes_in_output=True

    @staticmethod
    def uniq(input):
        temp = set(input)
        return list(temp)

    def get_sets_and_genes(self):
        """Reads the ranked gene file and determines which genes count as highly activated (perturbed) and which as lowly activated"""
        self.genes_list=[]
        self.level_list=[]
        #if 'FULL' not in self.out_str:
        mi=self.threshold
        gf = open(self.gene_file)
        if self.gene_file[-3:]=='csv':
            reader = csv.reader(gf, delimiter='\t')
        else:
            reader = open(self.gene_file,'r')
        c=0
        if self.gene_file[-3:]=='txt':
            for row in reader.read().splitlines():
                row=row.split('\t')
                try:
                    self.level_list.append(float(row[1]))
                    self.genes_list.append(row[0])
                    if self.gene_file_use==0 and float(row[1])>mi or self.gene_file_use==1 and -float(row[1])>mi or self.gene_file_use==2 and abs(float(row[1]))>mi:
                        c+=1
                except Exception:
                    continue
        else: #.csv
            for row in reader:
                try:
                    self.level_list.append(float(row[1]))
                    self.genes_list.append(row[0])
                    if self.gene_file_use==0 and float(row[1])>mi or self.gene_file_use==1 and -float(row[1])>mi or self.gene_file_use==2 and abs(float(row[1]))>mi:
                        c+=1
                except Exception:
                    continue
        l1=list(range(c))
        l1.append(len(self.genes_list))
        if self.gene_file_use == 1:
            self.genes_list.reverse()
            self.level_list=[-self.level_list[i] for i in range(len(self.level_list)-1,-1,-1)]
        if self.gene_file_use == 2:
            ind=sorted(range(len(self.level_list)), reverse=True, key=lambda k: abs(self.level_list[k]))
            self.genes_list=[self.genes_list[i] for i in ind]
            self.level_list=[abs(self.level_list[i]) for i in ind]
        gf.close()
        return

    def find_min_level_of_activation(self,threshold):
        nr_active=int(round(len(self.unique_genes)*float(threshold)))
        counter=0
        BROKE=0
        for i in range(len(self.genes_list)):
            if self.genes_list[i] in self.unique_genes:
                counter+=1
                if counter>=nr_active:
                    broke_at=i
                    BROKE=1
                    break
        if BROKE==0:
            broke_at=len(self.genes_list)-1
            nr_active=len(self.unique_genes)
        counter=len(set(self.unique_genes)&set(self.genes_list[:counter]))
        help=self.level_list[broke_at]
        if int(self.gene_file_use)==1:
            return -help
        else:
            return help

    def getG(self):
        """Distributes genes from the real data's genes_list into activity categories based on self.threshold_for_activation"""
        if self.nr_categories==0: #Singleton: self.nr_categories really is len(bounds)-2 in this case as well
            self.G=[]#set([unique_genes.index(genes_list[j])]) for j in range(len(bounds)-1)]
            for i in range(len(self.genes_list)):
                if self.level_list[i]>self.threshold_for_activation:
                    try:
                        ind=self.unique_genes.index(self.genes_list[i])
                        self.G.append(set([ind]))
                    except Exception:
                        pass
                else:
                    break
            self.dict_G=dict((list(self.G[i])[0],i) for i in range(len(self.G)))#{list(self.G[i])[0]:i for i in range(len(self.G))}
            self.G.append(set(range(len(self.unique_genes)))-set(self.dict_G.keys()))
        else:
            self.G=[[] for i in range(self.nr_categories+1)]
            perturbed=[]
            j=0
            while self.level_list[j]>self.threshold_for_activation:
                try:
                    ind=self.unique_genes.index(self.genes_list[j])
                    perturbed.append(ind)
                except Exception:
                    pass
                j+=1
            while j<len(self.genes_list): #the remaining genes are unperturbed
                try:
                    ind=self.unique_genes.index(self.genes_list[j])
                    self.G[-1].append(ind)
                except Exception:
                    pass
                j+=1
            #split the perturbed genes, ordered by expression level, into nr_categories equally sized categories
            for i in range(self.nr_categories):
                self.G[i]=perturbed[(i*len(perturbed))//self.nr_categories:((i+1)*len(perturbed))//self.nr_categories]
            self.dict_G=dict((gene,i) for i in range(self.nr_categories) for gene in self.G[i])

    def get_specific_rates(self,G,falserate,levels):
        """Spreads the overall false rate (falserate) over the activity categories in G:
        rates grow linearly from rates[0] to rates[s-1]=self.belief*rates[0], while the
        size-weighted average over all perturbed-gene categories equals falserate"""
        s=self.nr_categories
        if s==0: #Singleton case: each perturbed gene forms its own category
            s=len(G)-1
        G_lengths=[len(el) for el in G]
        nr_perturbed=sum(G_lengths[:-1])
        rates=[]
        if self.nr_categories>0: #FIXED error in formula, 32d_web and later
            summe=0
            for i in range(s):
                summe += (s-1-i+self.belief*i)*G_lengths[i]
            rates.append(falserate*(s-1)*nr_perturbed*1./summe)
        else: #For singleton case, a simpler formula suffices since G_lengths[i]==1 for all i
            rates.append(2.*falserate/(self.belief+1))
        for i in range(1,s):
            #Once rates[0] is found, multiply by a linearly increasing constant to get rates[1],...,rates[s-1]
            rates.append(rates[0]*((self.belief-1)*1.*i/(s-1)+ 1))
        return rates

    #################################
    ###         MCMC Code         ###
    #################################
    def propose_state(self,neighborhood_size):
        self.proposal_toggle, self.proposal_delete, self.proposal_add = -1,-1,-1
        r=random.random()
        if (r > self.proportion_parameter_change):
            proposal = int(random.random()*neighborhood_size)
            if (proposal < self.number_of_sets):
                self.proposal_toggle = proposal
                self.toggle_state(proposal)
                #learn prob every time because it is easy
                self.old_nr_prob=self.nr_prob
                self.old_prob=self.prob
                self.nr_prob=min(max(1,self.number_of_sets-self.number_of_unselected_sets),self.P)
                self.prob=self.nr_prob*1./self.number_of_sets
            else:
                proposal -= self.number_of_sets
                selected_term_pos = int(proposal / self.number_of_unselected_sets) + self.number_of_unselected_sets
                unselected_term_pos = int(proposal % self.number_of_unselected_sets)
                self.proposal_delete = self.set_partition[selected_term_pos]
                self.proposal_add = self.set_partition[unselected_term_pos]
                self.toggle_state(self.proposal_delete)
                self.toggle_state(self.proposal_add)
        else:
            if (r < self.proportion_parameter_change/2): #change alpha
                self.old_nr_alpha = self.nr_alpha
                self.nr_alpha=int(random.random()*self.A)
                self.alpha = self.ALPHA_BETA[self.nr_alpha]
                self.proposal_which_parameter_changed = 0
            else: #change beta
                self.old_nr_beta = self.nr_beta
                self.nr_beta=int(random.random()*self.A)
                self.beta = self.ALPHA_BETA[self.nr_beta]
                self.old_EP=self.EP
                self.log_1_m_betas=self.possible_log_1_m_betas[self.nr_beta][:]
                if self.nr_categories==1:
                    self.EP = math.log(self.beta)*self.number_explained_perturbed_genes
                else:
                    self.EP = sum([self.log_1_m_betas[i] for i in self.explained_perturbed_genes[:self.number_explained_perturbed_genes]])
                self.proposal_which_parameter_changed = 1

    def hidden_member_selected(self,member):
        if (self.perturbed_genes[member]):
            self.EP+=self.log_1_m_betas[member] #EP
            self.NP-=self.alpha_weights[self.dict_G[member]] #NP
            self.explained_perturbed_genes[self.number_explained_perturbed_genes]=member
            self.pos_explained_perturbed_genes[member]=self.number_explained_perturbed_genes
            self.number_explained_perturbed_genes+=1
        else: #observation is false negative rather than true negative
            self.EU+=1
            self.NU-=1

    def hidden_member_unselected(self,member):
        if (self.perturbed_genes[member]):
            self.EP-=self.log_1_m_betas[member] #EP
            self.NP+=self.alpha_weights[self.dict_G[member]] #NP
            self.number_explained_perturbed_genes-=1
            self.explained_perturbed_genes[self.pos_explained_perturbed_genes[member]]=self.explained_perturbed_genes[self.number_explained_perturbed_genes]
            self.pos_explained_perturbed_genes[self.explained_perturbed_genes[self.number_explained_perturbed_genes]]=self.pos_explained_perturbed_genes[member]
        else:
            self.EU-=1 #EU
            self.NU+=1 #NU

    def add_set(self,to_add):
        if (self.sets_selected[to_add]):
            return
        self.sets_selected[to_add] = 1
        #Go through all associations of the term and increase the activation count
        for i in range(self.sizes_of_sets[to_add]): #sizes of sets == cc
            member = self.T[to_add][i]
            if self.hidden_count[member]==0: #A not yet selected member is about to be selected
                self.hidden_member_selected(member)
            self.hidden_count[member]+=1
        #Move the added set from the 0 partition to the 1 partition (it essentially becomes the
        #new first element of the 1 partition, while the last 0 element gets its original position)
        self.number_of_unselected_sets-=1
        if (self.number_of_unselected_sets != 0):
            pos = self.position_of_set_in_partition[to_add]
            e0 = self.set_partition[self.number_of_unselected_sets]
            #Move the last element of the 0 partition to the left
            self.set_partition[pos] = e0
            self.position_of_set_in_partition[e0] = pos
            #Let the newly added term be the first element of the 1 partition
            self.set_partition[self.number_of_unselected_sets] = to_add
            self.position_of_set_in_partition[to_add] = self.number_of_unselected_sets

    def remove_set(self,to_remove):
        if self.sets_selected[to_remove]==0:
            return
        self.sets_selected[to_remove] = 0
        #Go through all associations of the term and decrease the activation count
        for i in range(self.sizes_of_sets[to_remove]):
            member = self.T[to_remove][i]
            if (self.hidden_count[member] == 1): #An active member is about to be deactivated
                self.hidden_member_unselected(member)
            self.hidden_count[member]-=1
        #Converse of above. Here the removed set, which belonged to the 1 partition,
        #is moved to the end of the 0 partition while the element at that place is
        #pushed to the original position of the removed element.
        if (self.number_of_unselected_sets != (self.number_of_sets - 1)):
            pos = self.position_of_set_in_partition[to_remove]
            e1 = self.set_partition[self.number_of_unselected_sets]
            self.set_partition[pos] = e1
            self.position_of_set_in_partition[e1] = pos
            self.set_partition[self.number_of_unselected_sets] = to_remove
            self.position_of_set_in_partition[to_remove] = self.number_of_unselected_sets
        self.number_of_unselected_sets+=1

    def toggle_state(self,to_switch):
        if (self.sets_selected[to_switch]==0):
            self.add_set(to_switch)
        else:
            self.remove_set(to_switch)

    def undo_proposal(self):
        if (self.proposal_toggle != -1):
            self.toggle_state(self.proposal_toggle)
            self.prob=self.old_prob
            self.nr_prob=self.old_nr_prob
        elif (self.proposal_delete != -1): #then also self.proposal_add != -1
            self.toggle_state(self.proposal_delete)
            self.toggle_state(self.proposal_add)
        elif self.proposal_which_parameter_changed==0: #alpha was changed
            self.nr_alpha = self.old_nr_alpha
            self.alpha = self.ALPHA_BETA[self.nr_alpha]
        else: #beta was changed
            self.nr_beta = self.old_nr_beta
            self.beta = self.ALPHA_BETA[self.nr_beta]
            self.log_1_m_betas=self.possible_log_1_m_betas[self.nr_beta][:]
            self.EP=self.old_EP

    def get_score(self):
        L = self.EP + (self.nr_perturbed-self.number_explained_perturbed_genes)*math.log(self.alpha) + self.NP + math.log(1.0-self.alpha) * self.NU + math.log(self.beta)*self.EU
        if self.prob!=1:
            L += math.log(self.prob) * (self.number_of_sets - self.number_of_unselected_sets) + math.log(1.0-self.prob) * self.number_of_unselected_sets
        return L

    def initialize(self,IS=[]):
        s=self.nr_categories
        if s==0:
            s=len(self.G)-1
        self.nr_perturbed,self.nr_unperturbed=self.number_of_genes-len(self.G[-1]),len(self.G[-1])
        self.hidden_count=[0 for i in range(self.number_of_genes)]
        self.sets_selected=[0 for i in range(self.number_of_genes)]
        self.number_of_sets=len(self.T)
        self.number_of_unselected_sets=self.number_of_sets
        self.sizes_of_sets = [len(t) for t in self.T]
        #Learn self.alpha_beta_max
        self.G_lengths = [len(item) for item in self.G]
        denom_rate_1=1
        for i in range(1,s):
            denom_rate_1 *= ((self.belief-1)*1.*i/(s-1)+ 1)**(self.G_lengths[i]*1./self.nr_perturbed)
        if self.nr_categories!=1:
            self.alpha_beta_max = min(self.alpha_beta_max,1./(1+self.belief*1./denom_rate_1))
        else: #MGSA has 0.5 as maximal false rate
            self.alpha_beta_max = min(self.alpha_beta_max,0.5)
        if self.proportion_parameter_change>0:
            #set alpha, beta, prob initially to the closest allowed value
            const=self.alpha_beta_max*1./self.A #const equaled 0.05 in earlier versions
            self.ALPHA_BETA=[const*(i+1) for i in range(self.A)]
            if self.ALPHA_BETA[-1]==1: #can't happen anymore when self.alpha_beta_max is learned
                self.ALPHA_BETA=self.ALPHA_BETA[:-1]
                self.A-=1
            self.alpha_weights=list(map(math.log,self.get_specific_rates(self.G,1,self.levels)))
            possible_alphas_betas=[self.get_specific_rates(self.G,self.ALPHA_BETA[i],self.levels) for i in range(self.A)]
            self.possible_log_alphas = [list(map(math.log,possible_alphas_betas[i])) for i in range(self.A)]
            pre_possible_log_1_m_betas = [[math.log(1-val) for val in possible_alphas_betas[i]] for i in range(self.A)]
            self.possible_log_1_m_betas=[[0 for gene in range(self.number_of_genes)] for i in range(self.A)]
            for i in range(self.A):
                if self.nr_categories==0:
                    for j in range(self.nr_perturbed):
                        self.possible_log_1_m_betas[i][list(self.G[j])[0]]=pre_possible_log_1_m_betas[i][j]
                else:
                    for j in range(self.nr_categories):
                        for jj in self.G[j]:
                            self.possible_log_1_m_betas[i][jj]=pre_possible_log_1_m_betas[i][j]
            self.max_sum_log_1_m_betas=[sum(self.possible_log_1_m_betas[i]) for i in range(self.A)]
            self.ab=possible_alphas_betas
            (self.alpha,self.nr_alpha)=discretize(self.ALPHA_BETA,0.1)
            (self.beta,self.nr_beta)=discretize(self.ALPHA_BETA,0.25)
            self.prob=min(max(len(IS),1),self.P)*1./self.number_of_sets
            self.nr_prob=min(max(len(IS),1),self.P)-1
            self.log_alphas=self.possible_log_alphas[self.nr_alpha]
            self.log_1_m_betas=self.possible_log_1_m_betas[self.nr_beta]
        else:
            alphas=self.get_specific_rates(self.G,self.alpha,self.levels)
            betas=self.get_specific_rates(self.G,self.beta,self.levels)
            self.log_alphas = list(map(math.log,alphas))
            self.log_1_m_betas = [math.log(1-val) for val in betas]
        self.set_partition=list(range(self.number_of_sets))
        self.position_of_set_in_partition=list(range(self.number_of_sets))
        self.perturbed_genes = [1 for i in range(self.number_of_genes)]
        for unperturbed_gene in self.G[-1]:
            self.perturbed_genes[unperturbed_gene]=0
        #Initially, no set is active, hence all genes observed as perturbed are false positives
        self.NP = sum(self.alpha_weights)
        self.NU = self.number_of_genes - self.nr_perturbed
        self.EU,self.EP = 0,0
        self.number_explained_perturbed_genes=0
        self.explained_perturbed_genes=[-1 for i in range(self.nr_perturbed)]
        self.pos_explained_perturbed_genes=[-1 for i in range(self.number_of_genes)]
        self.score=self.get_score()
        self.max_score= self.score
        self.best_set = []
        self.term_count = [0]*self.number_of_sets

    def MCMC_alg(self,IS=[],verbose=0, levels=[]):
        """Runs the MCMC algorithm. Optional arguments: IS, an initial set of processes to start from,
        and verbose, which can take the values 0, 1, 2: the higher the value, the more output is produced."""
        MCMC_start=time.time()
        self.initialize(IS)
        learned_params=[]
        l=1
        total_steps=self.burnin_steps + self.MCMC_steps
        alpha_count = [0]*self.A
        beta_count = [0]*self.A
        prob_count = [0]*self.P
        neighborhood_size = self.number_of_sets + self.number_of_unselected_sets * (self.number_of_sets - self.number_of_unselected_sets)
        PRINT_AFTER_X_STEPS=(50000 if total_steps>500000 else self.MCMC_steps/50)
        while (l < total_steps):
            self.propose_state(neighborhood_size)
            new_score = self.get_score()
            if self.proposal_toggle!=-1:
                #Metropolis-Hastings: toggling a term changes the neighborhood size, so the
                #acceptance probability is corrected by the ratio of neighborhood sizes
                new_neighborhood_size = self.number_of_sets + self.number_of_unselected_sets * (self.number_of_sets - self.number_of_unselected_sets)
                log_accept_probability = (new_score - self.score) + math.log(neighborhood_size*1./new_neighborhood_size)
            else:
                log_accept_probability = new_score - self.score
            if (math.log(random.random()) >= log_accept_probability):
                self.undo_proposal()
            else:
                self.score = new_score
                if self.proposal_toggle!=-1:
                    neighborhood_size = new_neighborhood_size
            #record_activity(l, score)
            if (self.score > self.max_score):
                #Remember the score and the configuration
                self.max_score = self.score
                self.max_score_step = l
                self.max_score_alpha = self.alpha
                self.max_score_beta = self.beta
                self.max_score_prob = self.prob
                self.best_set = self.set_partition[self.number_of_unselected_sets:self.number_of_sets]
                self.max_score_sets_selected_length = self.number_of_sets - self.number_of_unselected_sets
            if l>=self.burnin_steps:
                for i in range(self.number_of_unselected_sets,self.number_of_sets):
                    self.term_count[self.set_partition[i]] += 1
                if self.proportion_parameter_change>0:
                    alpha_count[self.nr_alpha] += 1
                    beta_count[self.nr_beta] += 1
                    prob_count[self.nr_prob-1] += 1
            l += 1
            if l%PRINT_AFTER_X_STEPS==0:
                if verbose>0:
                    print("%i out of %i steps completed; passed MCMC time %f" % (l,total_steps,time.time()-MCMC_start))
                if verbose>1:
                    print(l, self.score, self.alpha,self.beta, self.nr_prob, self.EP,self.NP,self.EU,self.NU)
                learned_params.append([l,self.score,self.alpha,self.beta,str(int(round(self.prob*len(self.T))))+"/"+str(self.number_of_sets),self.prob,self.number_of_sets - self.number_of_unselected_sets])
        # Divide the count vector by MCMC_steps to get the final probability distribution
        Final_Distribution = [x*1./self.MCMC_steps for x in self.term_count]
        # Useful to return a best term set - can base this off of an arbitrary cutoff of appearances in the chain
        # Order the terms from highest frequency to lowest
        if self.proportion_parameter_change>0:
            hist_alpha=([a*1./self.MCMC_steps for a in alpha_count],self.ALPHA_BETA)
            hist_beta=([b*1./self.MCMC_steps for b in beta_count],self.ALPHA_BETA)
            hist_prob=([p*1./self.MCMC_steps for p in prob_count],range(1,self.P+1))
        if self.proportion_parameter_change>0:
            return (self.max_score, Final_Distribution, learned_params, hist_alpha, hist_beta, hist_prob)
        else:
            return (self.max_score, Final_Distribution, learned_params,[],[],[])

    ########################################
    ###  Data analysis/plotting methods  ###
    ########################################
    def mean_gene_level(self,C,G,T,levels,categories_of_G=[]):
        """For a list of GO terms, C, this method returns the average prediction level of all genes annotated by a term"""
        TT=[0 for t in T]
        if categories_of_G!=[]:
            Gunion=set.union(*[set([])] + [G[i] for i in categories_of_G])
            for c in C:
                TT[c]=list(set(T[c])&Gunion)
        output=[]
        for c in C:
            if categories_of_G!=[]:
                helper=[levels[TT[c][i]] for i in range(len(TT[c]))]
            else:
                helper=[levels[T[c][i]] for i in range(len(T[c]))]
            res=mean(helper) if helper!=[] else 0
            output.append(res)
        return output

    def jaccard(self, liste,T,term_names,verbose=False):
        """Returns a table, in which all terms in liste are pairwise compared w.r.t. their Jaccard index"""
        res=[[0 for i in range(len(liste))] for j in range(len(liste))]
        for i in range(len(liste)):
            for j in range(i+1,len(liste)):
                if i==j:
                    continue
                if T[liste[i]]!=[] or T[liste[j]]!=[]:
                    res[i][j]=len(set(T[liste[i]])&set(T[liste[j]]))*1./len(set(T[liste[i]])|set(T[liste[j]]))
                    res[j][i]=res[i][j]
                    if verbose>0:
                        print(term_names[liste[i]],'&',term_names[liste[j]],res[i][j])
        if len(liste)==2:
            return res[0][1]
        else:
            return res

    #########################
    ###  P-value methods  ###
    #########################
    @staticmethod
    def hypergeometric_pmf(x, m, n, k):
        """
        Given a population consisting of `m` items of class M and `n` items of class N,
        this returns the probability of observing `x` items of class M when sampling `k`
        times without replacement from the entire population (i.e., {M,N})

        p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)
        """
        a = math.log(CRFE.binomial_coefficient(m, x))
        b = math.log(CRFE.binomial_coefficient(n, k-x))
        c = math.log(CRFE.binomial_coefficient(m+n, k))
        return math.exp(a+b-c)
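    # Hedged worked example (illustration only, not used by the pipeline):
    # with m=3 perturbed genes, n=7 unperturbed genes, and a term annotating
    # k=4 genes, the probability that exactly x=2 of the annotated genes are
    # perturbed is choose(3,2)*choose(7,2)/choose(10,4) = 3*21/210 = 0.3,
    # which CRFE.hypergeometric_pmf(2, 3, 7, 4) reproduces.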
    ## From dendropy.mathlib.probability
    @staticmethod
    def binomial_coefficient(population, sample):
        "Returns `population` choose `sample`."
        s = max(sample, population - sample)
        assert s <= population
        assert population > -1
        if s == population:
            return 1
        numerator = 1
        denominator = 1
        for i in range(s+1, population + 1):
            numerator *= i
            denominator *= (i - s)
        return numerator//denominator

    def p_value(self, nr_annotated_in_GOterm,sizeGOterm,sizeV,sizeC):
        s=0
        for i in range(nr_annotated_in_GOterm,min(sizeC,sizeGOterm)+1):
            try:
                s+=CRFE.hypergeometric_pmf(i,sizeGOterm,sizeV-sizeGOterm,sizeC)
            except AssertionError:
                return 'UNKNOWN'
        return s

    def p_values_of_list(self, liste):
        sizeC = 0
        lenG = len(self.G)
        for i in range(lenG - 1):
            sizeC += len(self.G[i])
        sizeV = sizeC + len(self.G[-1])
        p = []
        for i in range(len(liste)):
            myTset = set(self.T[liste[i]])
            anded=myTset-set(self.G[-1])
            p.append(self.p_value(len(anded),len(self.T[liste[i]]),sizeV,sizeC))
        return p

    ##################################################
    ###  Methods for initial clean-up of ontology  ###
    ##################################################
    def find_equal_terms(self):
        cc=[len(self.T[i]) for i in range(len(self.T))]
        ind=sorted(range(len(cc)), key=lambda k: cc[k])
        cc_ord=[cc[ind[i]] for i in range(len(cc))]
        res=[]
        for i in range(len(cc)):
            for j in range(i+1,len(cc)):
                if cc_ord[i]<cc_ord[j]: #terms are ordered by size, only equally sized terms can annotate the same genes
                    break
                if set(self.T[ind[i]])==set(self.T[ind[j]]):
                    res.append((ind[i],ind[j]))
        return res

    def print_combined_term_names(self):
        #Terms that were merged during the clean-up carry a '&' in their combined name
        print("New combined nodes:\n")
        for name in self.term_names:
            if name.find('&')!=-1:
                print(name)

    def remove_genes_at_most_annotated_to_root(self,kind=1):
        #print 'remove genes at most annotated to the root: biological process...'
        lug=len(self.unique_genes)
        if kind==1:
            toberemoved=set(range(lug))-set.union(*[set([])] + [set(t) for t in self.T if len(t)<lug])
        else:
            toberemoved=set([])
        return toberemoved

    def load_all(self):
        """Reads the annotations file and builds, for each term, the list of annotated genes
        (as indices into self.unique_genes); terms outside the size cutoffs are removed"""
        f=open(self.annotations_file,'r')
        T=[]
        term_names=[]
        unique_genes=[]
        dict_genes={}
        number_of_genes=0
        i=0
        for line in f.read().splitlines():
            a=line.split('\t')
            term_names.append(a[0])
            T.append([])
            if len(a)>2 or " " not in a[1]:
                annotated_genes=a[1:]
            elif " " in a[1]:
                annotated_genes=a[1].split(' ')
            for gene in annotated_genes:
                try:
                    T[i].append(dict_genes[gene])
                except Exception:
                    if gene in self.genes_list:
                        dict_genes.update({gene:number_of_genes})
                        unique_genes.append(gene)
                        T[i].append(number_of_genes)
                        number_of_genes+=1
            i+=1
        f.close()
        cc=[len(T[i]) for i in range(len(T))]
        for i in range(len(cc)-1,-1,-1):
            if ((self.upper_cutoff>self.lower_cutoff and cc[i]>self.upper_cutoff or cc[i]<self.lower_cutoff)):
                T.pop(i)
                term_names.pop(i)
        self.T=T
        self.term_names=term_names
        self.unique_genes=unique_genes

    def writeOut_MCMC(self, filename, learned_params, hist_alpha, hist_beta, hist_prob):
        """Writes the learned parameters, their posterior distributions and the time course of the MCMC run to an html file"""
        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)
        f_out = open(self.output_folder+'result_mcmc_'+filename+".html", 'w+')
        nr_terms=self.number_of_sets
        strings="<html><body><h2>Learnt parameters</h2><table border='1'>"
        strings+="<tr><td>False positive rate (alpha):</td><td>%s</td></tr>" % (self.alpha)
        strings+="<tr><td>False negative rate (beta):</td><td>%s</td></tr>" % (self.beta)
        strings+="<tr><td>Penalization parameter (q):</td><td>%s</td></tr>" % (self.prob)
        strings+="<tr><td>Penalization per term:</td><td>%s</td></tr>" % (-math.log(self.prob/(1-self.prob)))
        strings+="</table>"
        if hist_alpha!=[]:
            strings+="<h2>Posterior Parameter Value Distribution</h2><table border='1'>"
            strings+="<tr><td>Value</td>"
            for i in range(len(hist_alpha[1])):
                strings+="<td>%s</td>" % (round(hist_alpha[1][i],3))
            strings+="</tr><tr><td>alpha</td>"
            for i in range(len(hist_alpha[1])):
                strings+="<td>%s</td>" % (round(hist_alpha[0][i],3) if 1>hist_alpha[0][i]>0 else int(hist_alpha[0][i]))
            strings+="</tr><tr><td>beta</td>"
            for i in range(len(hist_beta[1])):
                strings+="<td>%s</td>" % (round(hist_beta[0][i],3) if 1>hist_beta[0][i]>0 else int(hist_beta[0][i]))
            strings+="</tr></table>"
            strings+="<table border='1'><tr><td>q*%i</td>" % nr_terms
            for i in range(len(hist_prob[1])):
                strings+="<td>%s</td>" % (hist_prob[1][i])
            strings+="</tr><tr><td>distr</td>"
            for i in range(len(hist_prob[1])):
                strings+="<td>%s</td>" % (round(hist_prob[0][i],3) if 1>hist_prob[0][i]>0 else int(hist_prob[0][i]))
            strings+="</tr></table>"
        strings+="<h2>Time course</h2><table border='1'><tr>"
        names=["Step","Log score","alpha","beta","Expected number of terms","q","Number of selected terms"]
        for i in range(7):
            strings+="<th>%s</th>" % names[i]
        strings+="</tr>\n"
        for j in range(len(learned_params)):
            strings+="<tr bgcolor='%s'>" % ("#EEEEEE" if j%2==0 else "#FFFFFF")
            for i in range(7):
                strings+="<td>%s</td>" % (str(round(learned_params[j][i],3) if i in [1,2,3] else (round(learned_params[j][i],5) if i==5 else learned_params[j][i])))
            strings+="</tr>\n"
        f_out.write(strings)
        f_out.write("</table></body></html>\n\n")
        f_out.flush()
        f_out.close()
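    # runMe() below calls self.get_filename(method, LONG=True) to obtain a name stem
    # for the output files. The sketch here is a minimal stand-in under the assumption
    # that any readable, run-specific string works; the exact naming scheme it uses
    # (identifier, gene file stem, k, belief, timestamp) is an assumption, not the
    # original convention.
    def get_filename(self,method,LONG=True):
        #Combine the identifier, the gene file name and the main parameters into a file name stem
        parts=[self.out_str, os.path.split(self.gene_file)[1].split('.')[0],
               'k'+str(self.nr_categories), 'belief'+str(self.belief),
               datetime.datetime.now().strftime("%Y%m%d_%H%M%S")]
        return '_'.join([p for p in parts if p!=''])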
\n\n") f_out.flush() f_out.close() def writeOut(self, filename, IS, mean_max_score, std_max_score, enriched, ps, mean_gene_level, method, avg_posteriors, std_posteriors): if self.gene_file_use==1: mean_gene_level=[-m for m in mean_gene_level] filename=self.output_folder+'result_'+filename if not os.path.exists(self.output_folder): os.makedirs(self.output_folder) f_out = open(filename+".txt", 'w+') text=["Creation Time:\t" + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")+"\n\n"] text.append("INPUT:\n\n") str1 = "Annotations file:\t%s\nGene file:\t%s\nUse of gene file:\t%s\nlower limit annotations/process:\t%s\nupper limit annotations/process:\t%s\nk (number of perturbed categories):\t%s\nbelief:\t%s\nMethod:\t%s\nrepeats:\t%s\nburnin steps:\t%s\nMCMC steps:\t%s\nMaximal value for alpha and beta:\t%s\nProbability parameter change:\t%s\nseed for RNG:\t%s\n" % (self.annotations_file, self.gene_file, ("normally ordered list" if self.gene_file_use==0 else ("list ordered by absolute value" if self.gene_file_use==2 else "inverted list")), self.lower_cutoff, self.upper_cutoff if self.upper_cutoff>=self.lower_cutoff else "0 (i.e., None)", self.nr_categories if self.nr_categories>1 and self.nr_categories<=len(self.T) else "0 (i.e., each perturbed gene in its own category)", self.belief, method[0], self.repeats, self.burnin_steps, self.MCMC_steps, self.alpha_beta_max, self.proportion_parameter_change, self.seed) text.append(str1 + "\n") text.append("Initial set of processes:\n") if IS==[]: text.append('Empty set\n') else: for i in IS: text.append(str(i) + "\t" + self.term_names[i]+"\n") text.append("\n\nINPUT ANALYSIS:\n\n") text.append("Total number of processes:\t"+ str(len(self.T))+"\n") text.append("Total number of genes:\t"+ str(len(self.G[-1])+len(self.G)-1)+"\n") text.append("Number of perturbed genes:\t" +str(len(self.G)-1)+ "\n") text.append("\n\n") text.append("\nOUTPUT:\n") text.append("\nLearnt parameters:\n") str2 = "alpha (FPR):\t%s\nbeta (FNR):\t%s\nq (penalization parameter):\t%s\npenalization per term, i.e., -log(q/(1-q)):\t%s\n" % ( self.alpha, self.beta, self.prob, -math.log(self.prob/(1-self.prob))) text.append(str2+"\n\n") text.append("Best found log likelihood value:\t" + str(mean_max_score) + "+-"+str(std_max_score)+"\n\n\n") if avg_posteriors==[]: text.append("Order\t#(Annotations)\t#(Annotations of perturbed genes)\tp-value\tAverage Expression Level\tTerm Name" ) else: text.append("Order\t#(Annotations)\t#(Annotations of perturbed genes)\tMean Posterior Probability\tStandard Deviation Posterior Probability\t"+("" if ps==[] else "p-value\t")+"Average Expression Level\tTerm Name" ) if self.show_genes_in_output: text.append("\tExplained Perturbed Genes") text.append("\n") out_list = [str(count+1) + "\t" + str(i[2]) + "\t" + str(i[2]-len(set(self.T[i[0]])&set(self.G[len(self.G)-1]))) for (count,i) in zip(range(len(enriched)),enriched)] if avg_posteriors!=[]: for i in range(len(out_list)): out_list[i]+="\t"+str(round(avg_posteriors[i],5))+"\t"+str(round(std_posteriors[i],5)) if self.show_genes_in_output: ind=sorted(range(len(self.levels)), reverse=True, key=lambda k: self.levels[k]) ind=ind[:self.nr_perturbed] for i in range(len(mean_gene_level)): if ps!=[]: out_list[i] += "\t" + str("%.3e" % ps[i]) out_list[i]+="\t" + str(round(mean_gene_level[i],3)) + "\t" + str(enriched[i][1]) if self.show_genes_in_output: out_list[i]+='\t' for j in ind: if j in self.T[self.terms[i]]: out_list[i]+=self.unique_genes[j]+" " out_list[i]=out_list[i][:-1] out_list[i]+="\n" 
            text.append(out_list[i])
        f_out.writelines(text)
        f_out.close()

    def determine_method(self, israndom):
        if israndom>=1:
            IS="a randomly chosen set of "+str(min(israndom,len(self.T)))+" processes"
        elif israndom>0:
            IS="a randomly chosen set of processes (each process is in the set with probability p="+ str(israndom)+")"
        else:
            IS="an empty set"
        text="MCMC"
        if self.proportion_parameter_change>0:
            text+=" with parameter learning"
        text+=" with "+IS+" as initial set"
        return [text,israndom]

    #####################################
    ###  Main Function of this Class  ###
    #####################################
    def runMe(self, verbose=False, israndom=0):
        start=time.time()
        self.get_sets_and_genes()
        if verbose:
            print('Checkpoint A, passed time',time.time()-start)
        self.load_all()
        if verbose:
            print('Checkpoint B, passed time',time.time()-start)
        self.glist=[self.genes_list.index(self.unique_genes[i]) for i in range(len(self.unique_genes))]
        self.levels=[self.level_list[self.glist[i]] for i in range(len(self.unique_genes))]
        if self.threshold_type=='proportion':
            self.threshold_for_activation=self.find_min_level_of_activation(self.threshold)
        else:
            self.threshold_for_activation=self.threshold
        dummy=self.glist[:]
        dummy.sort()
        self.levels_ordinal = [dummy.index(g) for g in self.glist]
        self.Tset=set(range(len(self.T)))
        self.number_of_genes = len(self.unique_genes)
        if verbose:
            print('Checkpoint C, passed time',time.time()-start)
        self.getG()
        IS = []
        if israndom>0:
            lenT = len(self.T)
            if israndom>=1:
                IS = random.sample(range(lenT), int(min(lenT,israndom)))
            else:
                for i in range(lenT):
                    if random.random()<israndom:
                        IS.append(i)
        if self.proportion_parameter_change>0:
            hist_alphas,hist_betas,hist_probs=[],[],[]
        max_scores,posteriors=[],[]
        for i in range(self.repeats):
            (max_score, MCMC_Distr, learned_params, hist_alpha, hist_beta, hist_prob)=self.MCMC_alg(IS=IS,verbose = 1 if verbose else -1)
            max_scores.append(max_score)
            posteriors.append(MCMC_Distr)
            if self.proportion_parameter_change>0:
                hist_alphas.append(hist_alpha[0])
                hist_betas.append(hist_beta[0])
                hist_probs.append(hist_prob[0])
                if i==0:
                    hist_alphas_choices=hist_alpha[1]
                    hist_probs_choices=hist_prob[1]
        if self.proportion_parameter_change>0:
            mean_hist_alphas=[sum([hist_alphas[j][i] for j in range(self.repeats)])*1./self.repeats for i in range(len(hist_alphas[0]))]
            mean_hist_betas=[sum([hist_betas[j][i] for j in range(self.repeats)])*1./self.repeats for i in range(len(hist_betas[0]))]
            mean_hist_probs=[sum([hist_probs[j][i] for j in range(self.repeats)])*1./self.repeats for i in range(len(hist_probs[0]))]
            self.alpha=hist_alphas_choices[mean_hist_alphas.index(max(mean_hist_alphas))]
            self.beta=hist_alphas_choices[mean_hist_betas.index(max(mean_hist_betas))]
            self.prob=hist_probs_choices[mean_hist_probs.index(max(mean_hist_probs))]*1./len(self.T)
        avg_posteriors=[mean([posteriors[i][j] for i in range(self.repeats)]) for j in range(len(self.T))]
        std_posteriors=[std([posteriors[i][j] for i in range(self.repeats)]) for j in range(len(self.T))]
        self.terms=sorted(range(len(self.term_count)), reverse=True, key=lambda k: avg_posteriors[k])
        avg_posteriors=[avg_posteriors[i] for i in self.terms]
        std_posteriors=[std_posteriors[i] for i in self.terms]
        enriched = []
        for i in range(len(self.terms)):
            if avg_posteriors[i]<1./10 and i>20 or i>100: #output between 20 and 100 terms, stop when the posterior falls below 0.1
                break
            enriched.append((self.terms[i], self.term_names[self.terms[i]], len(self.T[self.terms[i]])))
        self.terms=self.terms[:len(enriched)] #so that terms that have a posterior below 0.1 get cut out of C
        if verbose:
            print('Checkpoint E, passed time',time.time()-start)
        ps = self.p_values_of_list(self.terms)
        mean_gene_level=self.mean_gene_level(self.terms,self.G,self.T,self.levels)
        method=self.determine_method(israndom)
        filename=self.get_filename(method,LONG=True)
        self.writeOut(filename, IS, mean(max_scores), std(max_scores), enriched, ps, mean_gene_level, method, avg_posteriors, std_posteriors)
        if self.proportion_parameter_change>0:
            self.writeOut_MCMC(filename, learned_params, [mean_hist_alphas,hist_alphas_choices],[mean_hist_betas,hist_alphas_choices],[mean_hist_probs,hist_probs_choices])
        if verbose:
            print('Checkpoint F, passed time',time.time()-start)
        return (self.terms,avg_posteriors[:len(enriched)],std_posteriors[:len(enriched)])

###############################################
###  Command Line Support via main function ###
###############################################
def main():
    '''This kicks off the process and parses the options from the arguments.'''
    p = optparse.OptionParser('This program implements the functional enrichment method CRFE, described in C. Kadelka, M. Brandon, T. M. Murali, Concise Functional Enrichment of Ranked Gene Lists. See www.math.vt.edu/people/claus89/crfe/ for help.')
    p.add_option('--annotations_file', '-a', default='human-taxon-id-gene2go-with-annotation-status-closed-bioprocess-only.csv', help='csv file for the GO terms')
    p.add_option('--gene_file', '-g', default='bkz_gene_list.csv', help='file string for a ranked list of genes')
    p.add_option('--gene_file_use', '-G', default='0', help='how the gene file should be used: 0=normal, 1=inverted, 2=absolute value')
    p.add_option('--threshold', '-t', default='0.3', help='Minimal expression/prediction level for a gene to be counted in one of the active categories (default 0.3)')
    p.add_option('--threshold_type', '-T', default='value', help='Whether the threshold should be interpreted as a value or as a proportion of perturbed genes (default value)')
    p.add_option('--lower_cutoff', '-c', default ='20', help='only GO terms annotating at least this many genes are considered (default 20)')
    p.add_option('--upper_cutoff', '-C', default ='200', help='only GO terms annotating at most this many genes are considered (default 200)')
    p.add_option('--nr_categories', '-k', default='0', help='Number of different categories of perturbed genes. If k=0, each gene is in its own category.')
    p.add_option('--belief', '-b', default='5', help='Belief for how much more active the highest gene set is compared to the least active gene set (default 5)')
    p.add_option('--repeats', '-n', default='1', help='Number of times CRFE is run (default 1).')
    p.add_option('--burnin_steps', '-s', default = '100000', help='number of steps used to initialize the Markov chain in MCMC (default 100000)')
    p.add_option('--MCMC_steps', '-S', default = '1000000', help='number of recorded steps in MCMC after the burnin period (default 1000000)')
    p.add_option('--alpha_beta_max', '-x', default = '0.5', help='Maximal value that alpha and beta may take on (default 0.5)')
    p.add_option('--diff_alpha_beta', '-X', default = '20', help='The size of the discrete space of values which alpha and beta may take on (default 20)')
    p.add_option('--diff_q', '-Q', default = '20', help='The size of the discrete space of values which prob can take on (default 20)')
    p.add_option('--probability_parameter_change', '-p', default = '0.2', help='Proportion of time the parameter is changed in MCMC (default 0.2)')
    p.add_option('--random_initial_set','-r', default='0', help='When nonzero, the algorithm does not start with an empty set but with r processes if r>=1; if 0<r<1, each process is in the initial set with probability r (default 0)')
    p.add_option('--seed', default='-1', help='seed for the random number generator; -1 uses a random seed (default -1)')
    p.add_option('--identifier', default='', help='string identifier that is added to the name of the output file')
    p.add_option('--output_folder', default='output/', help='folder to which the output is written (default output/)')
    p.add_option('--verbose', default='1', help='how much information is printed while CRFE runs (default 1)')

    options, arguments = p.parse_args()

    try:
        repeats = int(options.repeats)
    except ValueError:
        repeats = 1
    try:
        nr_categories = int(options.nr_categories)
    except ValueError:
        nr_categories = 0
    try:
        lower_cutoff = int(options.lower_cutoff)
    except ValueError:
        lower_cutoff = 20
    try:
        upper_cutoff = int(options.upper_cutoff)
    except ValueError:
        upper_cutoff = 200
    try:
        belief = float(options.belief)
    except ValueError:
        belief = 5
    try:
        threshold = float(options.threshold)
    except ValueError:
        threshold = 0.3
    threshold_type = options.threshold_type
    try:
        burnin_steps = int(options.burnin_steps)
    except ValueError:
        burnin_steps = 100000
    try:
        MCMC_steps = int(options.MCMC_steps)
    except ValueError:
        MCMC_steps = 1000000
    try:
        alpha_beta_max = float(options.alpha_beta_max)
    except ValueError:
        alpha_beta_max = 0.5
    try:
        verbose = int(options.verbose)
    except ValueError:
        verbose = 1
    try:
        proportion_parameter_change = float(options.probability_parameter_change)
        if proportion_parameter_change>1:
            proportion_parameter_change=0.2
        elif proportion_parameter_change<=0:
            proportion_parameter_change=0.01
    except ValueError:
        proportion_parameter_change = 0.2
    try:
        A = int(options.diff_alpha_beta)
    except ValueError:
        A = 20
    try:
        P = int(options.diff_q)
    except ValueError:
        P = 20
    try:
        israndom = float(options.random_initial_set)
        if israndom>=1:
            israndom=int(israndom)
        if israndom<=0:
            israndom=0
    except ValueError:
        israndom = 0
    try:
        seed = int(options.seed)
    except ValueError:
        seed = -1
    try:
        gene_file_use = int(options.gene_file_use)
    except ValueError:
        gene_file_use = 0
    gene_file = options.gene_file
    annotations_file = options.annotations_file
    out_str = options.identifier
    output_folder = options.output_folder

    myGOAlgo = CRFE(repeats, nr_categories, lower_cutoff, upper_cutoff, belief, threshold, threshold_type,
                    burnin_steps, MCMC_steps, alpha_beta_max, proportion_parameter_change, A, P,
                    gene_file, gene_file_use, annotations_file, output_folder, out_str, seed)
    myGOAlgo.runMe(verbose,israndom)

##Main Part
if __name__ == '__main__':
    main()

#lower_cutoff=4
#upper_cutoff=0
#nr_categories=0
#belief=5
#burnin=1000
#steps=5000
#repeats=3
#
#o = CRFE(repeats, nr_categories, lower_cutoff, upper_cutoff, 5,
#         0.4, 'proportion', burnin, steps, 0.5, 0.2, 19, 20, 'doc/sample_gene_file.txt', 0,
#         'doc/sample_annotation_file.txt',
#         'output/', 'SAMPLE', seed=0)
#
#(C,avgs,stds)=o.runMe(verbose=1, israndom=0)
#print(C,avgs,stds)
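# A hedged command-line sketch (the script name and the data files are placeholders,
# not shipped with this file): running CRFE on a tab-separated ranked gene list with
# a proportion threshold of 0.3 and GO terms annotating between 20 and 200 genes
# could look like
#
#   python crfe.py -g doc/sample_gene_file.txt -a doc/sample_annotation_file.txt \
#                  -t 0.3 -T proportion -c 20 -C 200 -b 5 -s 100000 -S 1000000
#
# The options are those defined in main() above. Results are written to the output
# folder as a tab-separated text file (see writeOut) and, when parameters are
# learned, an additional MCMC summary (see writeOut_MCMC).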