1010import subprocess
1111
1212from encode_lib_common import (
13+ get_gnu_sort_param ,
1314 get_num_lines ,
1415 get_peak_type ,
1516 human_readable_number ,
@@ -324,7 +325,7 @@ def subsample_ta_pe(ta, subsample, non_mito, mito_chr_name, r1_only, out_dir):
324325# convert encode peak file to hammock (for Wash U browser track)
325326
326327
327- def peak_to_hammock (peak , out_dir ):
328+ def peak_to_hammock (peak , mem_gb , out_dir ):
328329 peak_type = get_peak_type (peak )
329330 prefix = os .path .join (out_dir , os .path .basename (
330331 strip_ext_peak (peak )))
@@ -335,14 +336,26 @@ def peak_to_hammock(peak, out_dir):
335336 hammock_gz_tbi = '{}.gz.tbi' .format (hammock )
336337
337338 if get_num_lines (peak ) == 0 :
338- cmd = 'zcat -f {} | gzip -nc > {}' .format (peak , hammock_gz )
339- run_shell_cmd (cmd )
340- cmd2 = 'touch {}' .format (hammock_gz_tbi )
339+ run_shell_cmd (
340+ 'zcat -f {peak} | gzip -nc > {hammock_gz}' .format (
341+ peak = peak ,
342+ hammock_gz = hammock_gz ,
343+ )
344+ )
345+ run_shell_cmd (
346+ 'touch {hammock_gz_tbi}' .format (
347+ hammock_gz_tbi = hammock_gz_tbi ,
348+ )
349+ )
341350 else :
342- cmd = "zcat -f {} | "
343- cmd += "LC_COLLATE=C sort -k1,1V -k2,2n > {}"
344- cmd = cmd .format (peak , hammock_tmp )
345- run_shell_cmd (cmd )
351+ run_shell_cmd (
352+ 'zcat -f {peak} | '
353+ 'LC_COLLATE=C sort -k1,1V -k2,2n {sort_param} > {hammock_tmp}' .format (
354+ peak = peak ,
355+ sort_param = get_gnu_sort_param (mem_gb * 1024 ** 3 , ratio = 0.5 ),
356+ hammock_tmp = hammock_tmp ,
357+ )
358+ )
346359
347360 with open (hammock_tmp , 'r' ) as fin , open (hammock_tmp2 , 'w' ) as fout :
348361 id = 1
@@ -390,13 +403,22 @@ def peak_to_hammock(peak, out_dir):
390403
391404 fout .write ('\n ' )
392405
393- cmd2 = 'zcat -f {} | sort -k1,1 -k2,2n | bgzip -cf > {}'
394- cmd2 = cmd2 .format (hammock_tmp2 , hammock_gz )
395- run_shell_cmd (cmd2 )
396- cmd3 = 'tabix -f -p bed {}' .format (hammock_gz )
397- run_shell_cmd (cmd3 )
406+ run_shell_cmd (
407+ 'zcat -f {hammock_tmp2} | sort -k1,1 -k2,2n {sort_param} | '
408+ 'bgzip -cf > {hammock_gz}' .format (
409+ hammock_tmp2 = hammock_tmp2 ,
410+ sort_param = get_gnu_sort_param (mem_gb * 1024 ** 3 , ratio = 0.5 ),
411+ hammock_gz = hammock_gz ,
412+ )
413+ )
414+ run_shell_cmd (
415+ 'tabix -f -p bed {hammock_gz}' .format (
416+ hammock_gz = hammock_gz ,
417+ )
418+ )
398419
399420 rm_f ([hammock , hammock_tmp , hammock_tmp2 ])
421+
400422 return (hammock_gz , hammock_gz_tbi )
401423
402424
@@ -435,7 +457,7 @@ def starch_to_bed_gz(starch, out_dir):
435457 return bed_gz
436458
437459
438- def peak_to_bigbed (peak , peak_type , chrsz , out_dir ):
460+ def peak_to_bigbed (peak , peak_type , chrsz , mem_gb , out_dir ):
439461 prefix = os .path .join (out_dir ,
440462 os .path .basename (strip_ext (peak )))
441463 bigbed = '{}.{}.bb' .format (prefix , peak_type )
@@ -460,7 +482,7 @@ def peak_to_bigbed(peak, peak_type, chrsz, out_dir):
460482 int peak; "Point-source called for this peak; 0-based offset from chromStart. Set to -1 if no point-source called."
461483)
462484'''
463- bed_param = '-type=bed6+4 -as={}' .format (as_file )
485+ bed_param = '-type=bed6+4 -as={as_file }' .format (as_file = as_file )
464486 elif peak_type .lower () == 'broadpeak' :
465487 as_file_contents = '''table broadPeak
466488"BED6+3 Peaks of signal enrichment based on pooled, normalized (interpreted) data."
@@ -476,7 +498,7 @@ def peak_to_bigbed(peak, peak_type, chrsz, out_dir):
476498 float qValue; "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used."
477499)
478500'''
479- bed_param = '-type=bed6+3 -as={}' .format (as_file )
501+ bed_param = '-type=bed6+3 -as={as_file }' .format (as_file = as_file )
480502 elif peak_type .lower () == 'gappedpeak' :
481503 as_file_contents = '''table gappedPeak
482504"This format is used to provide called regions of signal enrichment based on pooled, normalized (interpreted) data where the regions may be spliced or incorporate gaps in the genomic sequence. It is a BED12+3 format."
@@ -498,26 +520,47 @@ def peak_to_bigbed(peak, peak_type, chrsz, out_dir):
498520 float qValue; "Statistical significance with multiple-test correction applied (FDR). Set to -1 if not used."
499521)
500522'''
501- bed_param = '-type=bed12+3 -as={}' .format (as_file )
523+ bed_param = '-type=bed12+3 -as={as_file }' .format (as_file = as_file )
502524 else :
503- raise Exception ('Unsupported peak file type {}!' .format (peak_type ))
525+ raise Exception ('Unsupported peak file type {peak_type }!' .format (peak_type = peak_type ))
504526
505527 # create temporary .as file
506528 with open (as_file , 'w' ) as fp :
507529 fp .write (as_file_contents )
508530
509- cmd1 = "cat {} > {}" .format (chrsz , chrsz_tmp )
510- run_shell_cmd (cmd1 )
511- cmd2 = "zcat -f {} | LC_COLLATE=C sort -k1,1 -k2,2n | "
512- cmd2 += 'awk \' BEGIN{{OFS="\\ t"}} {{if ($5>1000) $5=1000; '
513- cmd2 += 'if ($5<0) $5=0; print $0}}\' > {}'
514- cmd2 = cmd2 .format (peak , bigbed_tmp )
515- run_shell_cmd (cmd2 )
516- cmd3 = "bedClip {} {} {}" .format (bigbed_tmp , chrsz_tmp , bigbed_tmp2 )
517- run_shell_cmd (cmd3 )
518- cmd4 = "bedToBigBed {} {} {} {}" .format (
519- bed_param , bigbed_tmp2 , chrsz_tmp , bigbed )
520- run_shell_cmd (cmd4 )
531+ run_shell_cmd (
532+ 'cat {chrsz} > {chrsz_tmp}' .format (
533+ chrsz = chrsz ,
534+ chrsz_tmp = chrsz_tmp ,
535+ )
536+ )
537+
538+ run_shell_cmd (
539+ 'zcat -f {peak} | LC_COLLATE=C sort -k1,1 -k2,2n {sort_param} | '
540+ 'awk \' BEGIN{{OFS="\\ t"}} {{if ($5>1000) $5=1000; '
541+ 'if ($5<0) $5=0; print $0}}\' > {bigbed_tmp}' .format (
542+ peak = peak ,
543+ sort_param = get_gnu_sort_param (mem_gb * 1024 ** 3 , ratio = 0.5 ),
544+ bigbed_tmp = bigbed_tmp ,
545+ )
546+ )
547+
548+ run_shell_cmd (
549+ 'bedClip {bigbed_tmp} {chrsz_tmp} {bigbed_tmp2}' .format (
550+ bigbed_tmp = bigbed_tmp ,
551+ chrsz_tmp = chrsz_tmp ,
552+ bigbed_tmp2 = bigbed_tmp2 ,
553+ )
554+ )
555+
556+ run_shell_cmd (
557+ 'bedToBigBed {bed_param} {bigbed_tmp2} {chrsz_tmp} {bigbed}' .format (
558+ bed_param = bed_param ,
559+ bigbed_tmp2 = bigbed_tmp2 ,
560+ chrsz_tmp = chrsz_tmp ,
561+ bigbed = bigbed ,
562+ )
563+ )
521564
522565 # remove temporary files
523566 rm_f ([as_file , chrsz_tmp , bigbed_tmp , bigbed_tmp2 ])
0 commit comments