Skip to content

Commit 8e265ae

Browse files
c-dilksbaltzell
authored and committed
feat: merge a set of HIPO files to a smaller set
This tool merges a set of input HIPO files to a set of output HIPO files, where you may control the number of input files per output file; for example, use this tool if you have 1000 small HIPO files but would rather have 10 large HIPO files.
1 parent d1f48d3 commit 8e265ae

File tree

1 file changed

+107
-0
lines changed

1 file changed

+107
-0
lines changed

bin/hipo-multi-merge

Lines changed: 107 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,107 @@
1+
#!/usr/bin/env ruby
2+
3+
require 'optparse'
require 'ostruct'
require 'fileutils'
require 'shellwords'
6+
7+
# Print one `name = value` log line to stdout, with the label right-aligned so
# that the `=` signs of consecutive log lines form a single column.
#
# @param name [#to_s] label; left-padded with spaces to 30 columns (labels
#   longer than 30 characters are printed in full, not truncated)
# @param val [Object] value, rendered via string interpolation
# @return [nil]
def print_log(name, val)
  puts "#{name.to_s.rjust(30)} = #{val}"
end
10+
11+
# user options
# Every setting starts out unset (nil) or off (false); the required ones are
# validated after parsing.
@args = OpenStruct.new
@args.inputs     = nil
@args.output_dir = nil
@args.prefix     = nil
@args.num_merge  = nil
@args.use_batch  = false
@args.dry_run    = false

# Parse the command line into @args. Running with no arguments at all is
# treated as a request for the help text.
parser = OptionParser.new do |o|
  o.banner = <<~'BANNER'

    This tool merges a set of input HIPO files to a set of output HIPO files,
    where you may control the number of input files per output file; for example,
    use this tool if you have 1000 small HIPO files but would rather have 10
    large HIPO files.
  BANNER
  o.separator "USAGE: #{$0} [OPTIONS]..."
  o.separator ''
  o.separator 'REQUIRED OPTIONS:'
  o.on('-i', '--input INPUTS', 'input directory or file glob;', 'surround file glob in quotes') { |a| @args.inputs = a }
  o.on('-o', '--output OUTPUT_DIR', 'output directory') { |a| @args.output_dir = a }
  o.on('-p', '--prefix OUTPUT_PREFIX', 'output filename prefix; names will be:', ' [OUTPUT_DIR]/[OUTPUT_PREFIX]_#####.hipo') { |a| @args.prefix = a }
  # DecimalInteger rejects non-numeric input (e.g. "10abc") with an
  # OptionParser::InvalidArgument error instead of silently truncating it the
  # way String#to_i would.
  o.on('-n', '--num NUM_FILES', OptionParser::DecimalInteger, 'number of files per output merged file') { |a| @args.num_merge = a }
  o.separator ''
  o.separator 'OPTIONAL OPTIONS:'
  o.on('-b', '--batch', 'submit jobs to Slurm', '(default is sequential jobs)') { @args.use_batch = true }
  o.on('-d', '--dry-run', 'just print what would be done') { @args.dry_run = true }
  o.on_tail('-h', '--help', 'show this message') do
    puts o
    exit
  end
end
parser.parse!(ARGV.empty? ? ['--help'] : ARGV)
42+
43+
# check required options
# A required option that was never given on the command line is still nil.
required_values = [@args.inputs, @args.output_dir, @args.prefix, @args.num_merge]
if required_values.any?(&:nil?)
  raise 'missing required option(s); re-run with "--help" for guidance.'
end
# A chunk size of zero or less cannot be used to slice the input list.
raise 'option "--num" must be greater than zero' unless @args.num_merge > 0
48+
49+
# glob inputs
# A directory argument means "every *.hipo file directly inside it"; anything
# else is used as a glob pattern as given (after expansion to an absolute path).
input_glob = File.expand_path @args.inputs
input_glob = File.join input_glob, '*.hipo' if File.directory? input_glob
print_log 'input glob', input_glob
print_log 'output dir', @args.output_dir
print_log 'output prefix', @args.prefix
print_log 'num files per output', @args.num_merge

# chunks
# Sort explicitly: Dir.glob does not guarantee a sorted result on every Ruby
# version, and the grouping of inputs into output files should be reproducible
# from one run to the next.
input_files = Dir.glob(input_glob).sort
raise "no input files found with glob '#{input_glob}'" if input_files.empty?
input_chunks = input_files.each_slice(@args.num_merge).to_a
print_log 'num input files', input_files.size
print_log 'num output files', input_chunks.size
# A single chunk would just be one merge of everything, which this tool
# treats as "nothing to do".
raise 'option "--num" >= num input files, therefore there is nothing to do' if input_chunks.size == 1
64+
65+
# build commands
# One `hipo-utils -merge` command per chunk. Every argument is shell-escaped so
# that paths containing spaces, quotes or other shell metacharacters survive
# the trip through the shell as single arguments.
puts '=' * 82
merge_cmds = input_chunks.each_with_index.map do |input_chunk, chunk_num|
  out_name = File.join @args.output_dir, "#{@args.prefix}_#{chunk_num.to_s.rjust(5, '0')}.hipo"
  # Refuse to clobber existing output; checked for every chunk before anything runs.
  raise "output file #{out_name} already exists; cannot overwrite! delete it or choose another path/name" if File.exist? out_name
  Shellwords.join ['hipo-utils', '-merge', '-o', out_name, *input_chunk]
end

# sbatch commands
# In batch mode each merge command is wrapped in its own `sbatch` submission;
# otherwise the merge commands are executed as they are.
if @args.use_batch
  sbatch_args = {
    'job-name'      => "hipo_multi_merge___#{@args.prefix}",
    'account'       => 'clas12',
    'partition'     => 'production',
    'mem-per-cpu'   => 500,
    'time'          => '1:00:00',
    'ntasks'        => 1,
    'cpus-per-task' => 1,
  }.map { |opt, val| "--#{opt}=#{Shellwords.escape(val.to_s)}" }
  exe_cmds = merge_cmds.each_with_index.map do |merge_cmd, job_num|
    # %u and %x are left unescaped on purpose so that sbatch receives them
    # verbatim in the log file name pattern.
    log_name = "/farm_out/%u/%x_#{job_num.to_s.rjust(5, '0')}"
    # The merge command is passed inside single quotes; close-escape-reopen any
    # single quote it contains so the quoting cannot be broken out of.
    wrapped_cmd = merge_cmd.gsub("'") { "'\\''" }
    [
      'sbatch',
      *sbatch_args,
      "--output=#{log_name}.out",
      "--error=#{log_name}.err",
      "--wrap='#{wrapped_cmd}'",
    ].join ' '
  end
else
  exe_cmds = merge_cmds
end
97+
98+
# execute
# Dry run: only print what would happen. Otherwise create the output directory
# and run every command in order.
if @args.dry_run
  puts 'THIS IS JUST A DRY RUN. Here are the commands which would be executed:'
  puts '=' * 82
  puts "mkdir -p #{@args.output_dir}"
  exe_cmds.each { |cmd| puts cmd }
else
  FileUtils.mkdir_p @args.output_dir
  # `system` returns true only when the command ran and exited with status 0.
  # Keep going after a failure so the remaining chunks are still processed,
  # but remember which commands failed.
  failed_cmds = exe_cmds.reject { |cmd| system cmd }
  unless failed_cmds.empty?
    warn "ERROR: #{failed_cmds.size} of #{exe_cmds.size} command(s) failed:"
    failed_cmds.each { |cmd| warn "  #{cmd}" }
    # Non-zero exit status so that callers and schedulers can detect the failure.
    exit 1
  end
end

0 commit comments

Comments
 (0)