|
# Tag that names this run's input and output files, read from `args`.
tag = args.tag
cwd = os.getcwd()

# Split all_splicing_variants_<tag>.bed into chunk files (small_file_<n>.txt)
# of roughly `target_lines_per_file` data lines each, never separating lines
# that share the same variant (first three tab-separated fields). Inputs that
# are already small enough are handed to the R script directly.
target_lines_per_file = 25000
lines_per_file = 0  # running line-index threshold for the next chunk switch
input_file = f'all_splicing_variants_{tag}.bed'
with open(input_file) as bigfile:
    # Newline-terminate every line: the data lines are sorted below, and an
    # unterminated final line would otherwise fuse with whatever follows it.
    lines = [line if line.endswith('\n') else line + '\n' for line in bigfile]
count = len(lines)
if count <= target_lines_per_file:
    # Small input: no splitting needed, run the comparison on the whole file.
    # NOTE(review): no small_file_* is created on this path, while the later
    # bookkeeping counts small_file_* inputs against *_out.tsv outputs --
    # confirm the downstream check behaves as intended for small inputs.
    subprocess.run(
        [
            'Rscript',
            '--vanilla',
            '/home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R',
            tag,
            input_file,
        ],
        check=True,
    )
else:
    header = lines.pop(0)
    # Sorting puts lines of the same variant next to each other, so a chunk
    # boundary can be placed between two different variants.
    lines.sort()
    last_index = len(lines) - 1
    filenum = 1
    small_filename = f'small_file_{filenum}.txt'
    smallfile = open(small_filename, "w")
    try:
        smallfile.write(header)
        lines_per_file += target_lines_per_file
        for lineno, line in enumerate(lines):
            smallfile.write(line)
            # Once the threshold is reached, start a new chunk at the first
            # variant boundary. The last line has no successor to compare
            # against and must not open an empty trailing chunk.
            if lines_per_file <= lineno < last_index:
                fields1 = line.split('\t')
                variant1 = f'{fields1[0]}_{fields1[1]}_{fields1[2]}'
                fields2 = lines[lineno + 1].split('\t')
                variant2 = f'{fields2[0]}_{fields2[1]}_{fields2[2]}'
                if variant1 != variant2:
                    smallfile.close()
                    filenum += 1
                    small_filename = f'small_file_{filenum}.txt'
                    smallfile = open(small_filename, "w")
                    smallfile.write(header)
                    lines_per_file += target_lines_per_file
    finally:
        # Flush and close the final chunk so it is complete on disk before
        # anything else reads it.
        smallfile.close()
# Run the R comparison once per chunk file, in lexicographic file-name order.
files = glob.glob('small_file_*')
files.sort()
number_of_in_files = len(files)
for file in files:
    # Pass the command as an argument list (no shell) so that a tag or file
    # name containing spaces or shell metacharacters is handed to Rscript
    # verbatim. check=True aborts the run if the R script fails.
    subprocess.run(
        [
            'Rscript',
            '--vanilla',
            '/home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R',
            tag,
            file,
        ],
        check=True,
    )
# Collect the outputs present in the working directory. glob lacks reliable
# ordering, so impose your own if output order matters.
output_files = glob.glob("*_out.tsv")
output_files.sort()
number_of_out_files = len(output_files)
|
|
53 | 69 | print("Number of output files doesn't match the number of input files that should have been processed")
|
# Delete the temporary chunk files from the working directory.
files = glob.glob('small_file_*')
for file in files:
    os.remove(file)
0 commit comments