Commit be8d46b9 authored by Daniel Doerr's avatar Daniel Doerr
Browse files
parents d5cc1147 ea80e6dc
#!/usr/bin/env python2
import xml.etree.ElementTree as ET
from ilp_util import *
from argparse import ArgumentParser
import sys
GENE = 0
GENOME = 1
EXTREMITY = 2
DELETE = -1
def insert(pos, ins, d):
''' Insert gene ins at position pos into dictionary d, while dealing with the
edge cases of circular singletons etc.
'''
if d.get(pos, ins) == ins:
d[pos] = ins
return
#spurious case of circular singleton
if d[pos]==DELETE:
d[pos] = ins
return
if ins == DELETE:
return
raise Exception, "Inconsistent markers: %s assigned to %s and %s"%(pos, d[pos], ins)
def assign_markers(sol_filename):
''' Extract the matching as well as some distance stats from the given
gurobi solution file.
'''
max_val = None
assign = {}
vertices = 0
indels = 0
telomeres = 0
z_sum = 0
jumps = 0
with open(sol_filename) as fl:
for line in fl:
if line.startswith("# Objective value"):
if max_val != None:
print("Double maximum values!!!")
max_val = int(line.split("=")[1])
continue
if line.startswith("#"):
continue
nv = line.split(" ")
name = nv[0]
val = nv[1]
if name.startswith('z'):
z_sum+=round(float(val))
if name.startswith('delta_r'):
jumps+=round(float(val))
if name.startswith('x') and (round(float(val))==1):
exts = name.split('(')[1].split(')')[0].split(',')
#print(exts)
v1 = exts[0].split('_')
v2 = exts[1].split('_')
if v1[EXTREMITY] == EXTREMITY_TELOMERE and v1[GENOME] != v2[GENOME]:
telomeres+=2
if v1[GENE] == v2[GENE] :
insert(v1[GENE],DELETE,assign)
indels+=1
if v1[GENOME] != v2[GENOME]:
vertices+=2
insert(v1[GENE],v2[GENE],assign)
insert(v2[GENE],v1[GENE],assign)
return assign, max_val,z_sum, jumps, vertices, indels, telomeres, "N/A"
def rename_genes_g(genomes, assgns, gen):
''' Rename the uniquely labeled genes of both genomes by new identifiers
resulting from the matching (assgns).
'''
new_ids = {}
return map(lambda (name, chrs): (name, rename_genes_c(chrs, assgns, new_ids, gen)), genomes)
def rename_genes_c(chrs, assgns, new_ids, gen):
''' Rename the uniquely labeled genes of one genome by new identifiers
resulting from the matching (assgns). new_ids are the identifiers
already in use from possibly the other genome.
'''
return map(lambda (kind, chr): (kind, rename_genes(chr, assgns, new_ids, gen)), chrs)
def rename_genes(chr, assgns, new_ids, gen):
''' Rename the uniquely labeled genes of a chromosome by new identifiers
resulting from the matching (assgns). new_ids are the identifiers
already in use from possibly another chromosome.
'''
ret = []
for orient, id in chr:
if not id in assgns:
raise Exception, "No assignment for Gene %s made!"%id
other = assgns[id]
if other == DELETE:
ret.append((orient,'x_%d'%gen.get_new()))
elif other in new_ids:
ret.append((orient, new_ids[other]))
else:
#print(new_ids,id)
new_ids[id] = gen.get_new()
ret.append((orient, new_ids[id]))
return ret
def main():
parser = ArgumentParser(description= 'Read a gurobi solution as well as the uniquely labeled UniMoG file to compute a distance output and equivalent genomes, which do not contain duplicates.')
parser.add_argument('-i', nargs=1, required=True, help='gurobi solution file with a single solution to read from.')
parser.add_argument('-u', nargs=1, required=True, help='UniMoG unique Id file to read genomes from.')
parser.add_argument('-o', nargs=1, required=True, help='Output file for new, equivalent (according to the gurobi opimization performed) genomes without duplications.')
parser.add_argument('-d', nargs=1, help='Distance output file')
parser.add_argument('-g', nargs=1, type=int, help='Assumed number of Non-Deletion Markers for computing the DCJ distance if the number of genes shall not be deduced from the graph.')
parser.add_argument('--noheader', action="store_true", help="Repress print of explanatory header.")
args = parser.parse_args()
assgns, max_val, z_sum, jumps, vertices, indels, telomeres, status = assign_markers(args.i[0])
#print(assgns)
genes = vertices/4
if args.g:
genes = args.g[0]
distance = genes - max_val/2
genomes = []
with open(args.u[0]) as genome_file:
genomes = readGenomes(genome_file)
gen = Simple_Id_Generator()
genomes = rename_genes_g(genomes, assgns, gen)
with open(args.o[0], "w") as outfile:
print_id_genome_unimog(genomes[0],outfile,id_pos=1)
print_id_genome_unimog(genomes[1],outfile,id_pos=1)
if args.d:
distfile = open(args.d[0], "w")
else:
distfile = sys.stdout
if not args.noheader:
distfile.write("Solution-Type\t#Non-Deletion-Markers(+Telomere-Pairs)\tDistance\n")
distfile.write("%s\t%d\t%d\n"%(status,genes,distance))
main()
#!/usr/bin/env python
#!/usr/bin/env python2
import sys
from ilp_util import *
from argparse import ArgumentParser
......@@ -130,9 +130,6 @@ def add_to_list_dict(d, key, elem):
else:
d[key] = [elem]
def get_cross_genome_edges(v1, v2):
''' Generate the matching edges between the vertices v1,v2 of genomes 1 and 2
in the modified adjacency graph.
......@@ -140,22 +137,20 @@ def get_cross_genome_edges(v1, v2):
v1g = vertices_by_genes(v1)
v2g = vertices_by_genes(v2)
edges = []
duplicate_fams = 0
max_fam = 0
duplicates = 0
for gene_id, vertices in v1g.iteritems():
if gene_id in v2g:
if len(vertices) > 1 or len(v2g[gene_id]) > 1:
duplicates+=1
duplicate_fams+=1
d = max(len(vertices),len(v2g[gene_id]))
max_fam = max(max_fam, d)
duplicates+=d
edges.extend([g_Edge(u,v) for u in vertices if gene_id in v2g for v in v2g[gene_id]])
#print(v1g.get((TELOMERE_ID,EXTREMITY_TELOMERE),[]))
#print(v2g.get((TELOMERE_ID,EXTREMITY_TELOMERE),[]))
#add intra-genome telomer-telomer edges
#edges.extend([g_Edge(u,v) for u in v1g.get((TELOMERE_ID,EXTREMITY_TELOMERE),[])
# for v in v1g.get((TELOMERE_ID,EXTREMITY_TELOMERE),[])
# if u.vertex_id < v.vertex_id])
#edges.extend([g_Edge(u,v) for u in v2g.get((TELOMERE_ID,EXTREMITY_TELOMERE),[])
# for v in v2g.get((TELOMERE_ID,EXTREMITY_TELOMERE),[])
# if u.vertex_id < v.vertex_id])
LOG.info('Number of duplicates is: %d\n'%duplicates)
LOG.info('Number of duplicates is: %d\n'%int(duplicates/2))
LOG.info('Number of duplicate families: %d\n'%int(duplicate_fams/2))
LOG.info('Greatest family is: %d\n'%max_fam)
return edges
def create_adjacency_graph(genome1, genome2, id_generator):
......@@ -464,7 +459,7 @@ def smd(kx):
if x == CONST:
x=""
kk="%d"%k
return "%s%s"%(kk,x)
return "%s %s"%(kk,x)
def sum_str(ls):
''' Express a list of constant, variable pairs (n,x_i) as the string of a
......@@ -568,16 +563,25 @@ def singleton_constraints(genomes, ilp, edges):
return ilp
def main():
parser = ArgumentParser(description='Read and id an unimog file with two genomes and compute the corresponding DING ILP.')
epilog = "This script assumes the Unimog file has exactly two genomes. For selecting two genomes in an unimog file containing multiple ones (with unique names), both options -1/--genome1 and -2/--genome2 must be used."
parser = ArgumentParser(description='Read and id an unimog file with two genomes and compute the corresponding DING ILP.', epilog=epilog)
parser.add_argument('-i', nargs=1, required=True, help='Input File with exactly two genomes (labeled "A","B") in Unimog format.')
parser.add_argument('-u', nargs=1, required=True, help='Output File for uniquely labelled occurrences in Unimog format.')
parser.add_argument('-o', nargs=1, required=True, help='Output File for the resulting ILP in CPLEX lp format.')
parser.add_argument('--opt', nargs=1, choices=('s','mm','all','none'), help=argparse.SUPPRESS, default='all' )
parser.add_argument('-1', '--genome1', dest='genome1', metavar='A', type=str, required=False,
help='Name of the first genome you want to compare (must be one of the genomes in the Unimog file).')
parser.add_argument('-2', '--genome2', dest='genome2', metavar='B', type=str, required=False,
help='Name of the second genome you want to compare (must be one of the genomes in the Unimog file).')
parser.add_argument('--opt', nargs=1, choices=('s','mm','all','none'), help=argparse.SUPPRESS, default=['all'] )
parser.add_argument('-l',nargs=1, help='Log-file')
parser.add_argument('-v', action="store_true", help="Be verbose.")
parser.add_argument('--ignore-circular-singletons', action="store_true", help="Use only if ABSOLUTELY certain, that the matching solution will not contain circular singletons. Else distances and matchings computed using this option could be corrupted.")
args = parser.parse_args()
if (not args.genome1 and args.genome2) or (args.genome1 and not args.genome2):
raise Exception, "Please provide both both options -1/--genome1 and -2/--genome2 or none of them!"
st = logging.StreamHandler(sys.stderr)
if args.l:
st = logging.FileHandler(args.l[0])
......@@ -596,9 +600,15 @@ def main():
LOG.warning("'--opt' is a deprecated parameter and to be used for testing purposes only. Anything other than 'mm' or 'all' will not apply the Maximum Matching model.")
with open(args.i[0]) as file:
data = file.readlines()
genomes = readGenomes(data)
if args.genome1:
genomes = readGenomes(data, genomesOnly=(args.genome1, args.genome2))
if args.genome1 != genomes[0][0]: genomes.reverse()
else:
genomes = readGenomes(data)
if len(genomes) !=2:
raise Exception, "Please provide exactly two genomes in the unimog file!"
raise Exception, "Please provide exactly two genomes in the Unimog file or use the options -1/--genome1 and -2/--genome2!"
genomes[0] = (genome1, genomes[0][1])
genomes[1] = (genome2, genomes[1][1])
gen = Simple_Id_Generator()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment