Skip to content

Commit d8232ea

Browse files
author
zhangrengang
committed
support more cmap; check mcl
1 parent 29def7f commit d8232ea

File tree

3 files changed

+74
-29
lines changed

3 files changed

+74
-29
lines changed

SOI-tools.md

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,17 @@ For example, `CHR_Ae11-Ae15-Ca2-Ca7-Vv1_143_283.concat.treefile`, `Ae11-Ae15-Ca2
2727

2828
#### Allele identification ####
2929
For diploid and polyploid assembly, we may aim to identify allele genes. As allele genes are not always syntenic (e.g., non-syntenic due to samll-scale inversion), we combine synteny and orthology to identify a full set of allele genes.
30+
3031
After `soi filter` has identified orthologous synteny in the [example pipeline](https://github.com/zhangrengang/evolution_example), we run
31-
`
32+
```
3233
soi-syn retrieve_allele collinearity.ortho ../OrthoFinder/OrthoFinder/Results_*/ ../all_species_gene.gff sps="Panax_ginseng Panax_notoginseng" min_block=10 win_size=10 > allele.txt
33-
`
34-
`sps="Panax_ginseng Panax_notoginseng"` sets one or more target species or pseudo-species; species labels should be different for different subgenomes (label gene ID like `SP1|sgA.g1, SP1|sgB.g1`);
35-
`min_block=10` sets minimum length of syntenic blocks;
36-
`win_size=10` sets upstream and downstream 10 genes to retrieve orthologs using syntenic genes as anchors.
34+
```
35+
`sps="Panax_ginseng Panax_notoginseng"` sets one or more target species or pseudo-species (space seperated);
36+
species labels need to be different for different subgenomes (label gene ID like `SP1|sgA.g1`, `SP1|sgB.g1`);
37+
38+
`min_block=10` sets minimum length of syntenic blocks; increasing the value will result in more reliable alleles;
39+
40+
`win_size=10` sets upstream and downstream 10 genes to retrieve orthologs using syntenic genes as anchors; increasing the value will retrieve more alleles.
3741

3842
The output file is like:
3943
```
@@ -55,7 +59,7 @@ chrom idx Panax_ginseng Panax_notoginseng Panax_ginseng Panax_no
5559
```
5660
The 3rd and 4th columns indicate allelic gene pairs.
5761
The last column indicates the source of alleles; for example, `orthology:15422` means ortholog #15422,
58-
`synteny-1:51` means from syntenic block #1 and ortholog #51; `synteny-1:None` means that it is not pre-inferred as an ortholog.
62+
`synteny-1:51` means it is from syntenic block #1 and ortholog #51; `synteny-1:None` means that it is not pre-inferred as an ortholog.
5963
The column number will extend for polyploids.
6064

6165
#### Orthology format conversion ####

soi/dot_plotter.py

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,33 @@
1717
from .RunCmdsMP import logger
1818
from .WGDI import AK
1919
from .ploidy_plotter import add_ploidy_opts, get_ploidy, plot_bars
20-
from .mcscan import Collinearity, Gff, XCollinearity
20+
from .mcscan import Collinearity, XGff, XCollinearity
21+
22+
cmaps = {
23+
'viridis', 'plasma', 'inferno', 'magma', 'cividis',
24+
'Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds',
25+
'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu',
26+
'GnBu', 'PuBu', 'YlGnBu', 'PuBuGn', 'BuGn', 'YlGn',
27+
'binary', 'gist_yarg', 'gist_gray', 'gray', 'bone', 'pink',
28+
'spring', 'summer', 'autumn', 'winter', 'cool', 'Wistia',
29+
'hot', 'afmhot', 'gist_heat', 'copper',
30+
'PiYG', 'PRGn', 'BrBG', 'PuOr', 'RdGy', 'RdBu',
31+
'RdYlBu', 'RdYlGn', 'Spectral', 'coolwarm', 'bwr', 'seismic',
32+
'berlin', 'managua', 'vanimo',
33+
'twilight', 'twilight_shifted', 'hsv',
34+
'Pastel1', 'Pastel2', 'Paired', 'Accent',
35+
'Dark2', 'Set1', 'Set2', 'Set3',
36+
'tab10', 'tab20', 'tab20b', 'tab20c',
37+
'flag', 'prism', 'ocean', 'gist_earth', 'terrain', 'gist_stern',
38+
'gnuplot', 'gnuplot2', 'CMRmap', 'cubehelix', 'brg',
39+
'gist_rainbow', 'rainbow', 'jet', 'turbo', 'nipy_spectral',
40+
'gist_ncar'}
2141

2242

2343
def dotplot_args(parser):
2444
parser.add_argument('-s', metavar='FILE', type=str, required=True, nargs='+',
2545
help="syntenic block file (*.collinearity, output of MCSCANX/WGDI)[required]")
26-
parser.add_argument('-g', metavar='FILE', type=str, required=True,
46+
parser.add_argument('-g', metavar='FILE', type=str, required=True, nargs='+',
2747
help="gene annotation gff file (*.gff, one of MCSCANX/WGDI input)[required]")
2848
parser.add_argument('-c', metavar='FILE', type=str, required=True,
2949
help="chromosomes config file (*.ctl, same format as MCSCANX dotplotter)[required]")
@@ -99,8 +119,8 @@ def dotplot_args(parser):
99119
help="plot histogram or not [default=%(default)s]")
100120
group_ks.add_argument('--max-ks', metavar='Ks', type=float, default=1,
101121
help="max Ks (x limit) [default=%(default)s]")
102-
group_ks.add_argument('--ks-cmap', metavar='Ks', type=float, nargs='+', default=None,
103-
help="color map for Ks. [default=%(default)s]")
122+
group_ks.add_argument('--ks-cmap', metavar='Ks', nargs='+', default=None, # type=float,
123+
help="color map for Ks, format: `jet` or `0.2 0.6 1 ...`. [default=%(default)s]")
104124
group_ks.add_argument('--ks-step', metavar='Ks', type=float, default=0.02,
105125
help="Ks step of histogram [default=%(default)s]")
106126
group_ks.add_argument('--use-median', action='store_true', default=False,
@@ -327,7 +347,9 @@ def plot_blocks(blocks, outplots, ks=None, max_ks=None, ks_hist=False, ks_cmap=N
327347
min_ks = min([v for v in Ks if v >= 0])
328348
except ValueError: # ValueError: min() arg is an empty sequence
329349
min_ks = 0
330-
if ks_cmap:
350+
if ks_cmap and ks_cmap[0] in cmaps:
351+
cmap = ks_cmap[0]
352+
elif ks_cmap:
331353
cmap = create_ks_map(ks_cmap, min_ks, max_ks)
332354
else:
333355
cmap = cm.jet
@@ -649,6 +671,7 @@ def create_ks_map(ks_map, min_ks, max_ks):
649671
import numpy as np
650672
from matplotlib import cm
651673
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
674+
ks_map = list(map(float, ks_map))
652675
length = 256
653676
maps = _norm_map(ks_map, min_ks, max_ks, length)
654677
print(ks_map, min_ks, max_ks, maps)
@@ -683,7 +706,7 @@ def parse_gff(gff, chrs1, chrs2):
683706
coord_graph1 = nx.Graph()
684707
coord_graph2 = nx.Graph()
685708
d_gff = {}
686-
for line in Gff(gff):
709+
for line in XGff(gff):
687710
if not line.chrom in chrs:
688711
continue
689712
key = (line.species, line.chrom)

soi/mcscan.py

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ def _parse_list(self, _collinearities):
240240
def _parse(self):
241241
if self.orthologs is not None:
242242
ortholog_pairs = set(XOrthology(self.orthologs, **self.kargs))
243-
logger.info('\t{} homologous pairs'.format(len(ortholog_pairs)))
243+
# logger.info('\t{} homologous pairs'.format(len(ortholog_pairs)))
244244
logger.info('parsing synteny from {} collinearity files: {} ...'.format(
245245
len(self.collinearities), self.collinearities[:3]))
246246
nblock, ngene = 0, 0
@@ -328,8 +328,8 @@ def _parse_list(self, _orthologs):
328328
def _parse(self):
329329
'''yield Pair object'''
330330
i = 0
331+
logger.info('parsing orthology: {} ...'.format(self.orthologs))
331332
for ortholog in self.orthologs:
332-
logger.info('parsing orthology: {} ...'.format(ortholog))
333333
if os.path.isdir(ortholog):
334334
# SonicParanoid / OrthoFinder
335335
parser = lazy_orthologs(ortholog)
@@ -968,21 +968,23 @@ def parse_gff(self):
968968
genes = set([])
969969
d_chr = {}
970970
d_length = {}
971-
for line in open(self.gff):
972-
line = lazy_decode(line)
973-
temp = line.rstrip().split('\t')
974-
if not temp or line.startswith('#'):
975-
continue
976-
chr, gene, start, end = temp[:4]
977-
if gene in genes: # remove repeat
978-
continue
979-
genes.add(gene)
980-
try:
981-
strand = temp[4]
982-
except IndexError:
983-
strand = None
984-
start, end = list(map(int, [start, end]))
985-
g = Gene((gene, chr, start, end, strand))
971+
for line in XGff(self.gff):
972+
# line = lazy_decode(line)
973+
# temp = line.rstrip().split('\t')
974+
# if not temp or line.startswith('#'):
975+
# continue
976+
# chr, gene, start, end = temp[:4]
977+
# if gene in genes: # remove repeat
978+
# continue
979+
# genes.add(gene)
980+
# try:
981+
# strand = temp[4]
982+
# except IndexError:
983+
# strand = None
984+
# start, end = list(map(int, [start, end]))
985+
chr, gene, start, end, strand = \
986+
line.chrom, line.gene, line.start, line.end, line.strand
987+
g = line.Gene #((gene, chr, start, end, strand))
986988
try:
987989
d_chr[chr] += [g]
988990
except KeyError:
@@ -1088,6 +1090,18 @@ def anchors2bed(collinearity, gff, chrmap, left_anchors, right_anchors, outbed=s
10881090
print('\t'.join(line), file=outbed)
10891091

10901092

1093+
class XGff(XOrthology):
1094+
def __init__(self, gffs):
1095+
self.gffs = self._parse_list(gffs)
1096+
def __iter__(self):
1097+
return self._parse()
1098+
def _parse(self):
1099+
logger.info('parsing gff files: {} ...'.format(self.gffs))
1100+
for gff in self.gffs:
1101+
for line in Gff(gff):
1102+
yield line
1103+
1104+
10911105
class Gff:
10921106
'''Gff parser'''
10931107

@@ -1853,6 +1867,7 @@ def parse_group(groups):
18531867

18541868
def cluster_by_mcl(collinearities, orthologs=None, inflation=2,
18551869
outgroup=None, ingroup=None, outpre='cluster'):
1870+
check_cmd('mcl')
18561871
ingroup = set(parse_group(ingroup))
18571872
outgroup = set(parse_group(outgroup))
18581873
logger.info('outgroup: {}'.format(outgroup))
@@ -1890,6 +1905,9 @@ def cluster_by_mcl(collinearities, orthologs=None, inflation=2,
18901905
nc = len([1 for line in open(cluster)])
18911906
logger.info('{} syntenic gene pairs reslut in {} SOGs'.format(np, nc))
18921907

1908+
def check_cmd(cmd):
1909+
logger.info('checking `{}`'.format(cmd))
1910+
run_cmd(cmd, log=True, )
18931911

18941912
def test_closest(collinearity, kaks, spsd, min_size=0):
18951913
ColinearGroups(collinearity, spsd, kaks=kaks,

0 commit comments

Comments
 (0)