v0.82: fix parsing bugs & improve parsing robustness

shockdude · shockdude · commit d37cec4fed33 · 2020-08-24T19:24:53.000-07:00
add chunkedogg_extract utility script to repo
diff --git a/README.md b/README.md
@@ -6,11 +6,12 @@ Uses pydub: https://github.com/jiaaro/pydub
 Usage: `python bms_to_rpp.py chart_file.bms [output_file.rpp]` \
 Or just drag-and-drop the chart onto `bms_to_rpp.py`
 
-Supports WAV keysounds. \
+Supports WAV (PCM) keysounds. \
 If your BMS does not include WAV keysounds, recommend converting them to WAV first. \
 OGG keysounds supported only if ffmpeg is installed, and processing will be very slow.
 
 Supports BPMs, extended BPMs, measure lengths/time signatures, and STOPs. \
 Negative BPMs untested. Other BMS features may not be implemented.
 
 Major props to the BMS command memo: http://hitkey.nekokan.dyndns.info/cmds.htm
+Major props to the DTX data format spec: https://ja.osdn.net/projects/dtxmania/wiki/DTX%2520data%2520format
diff --git a/bms_to_rpp.py b/bms_to_rpp.py
@@ -1,4 +1,4 @@
-# BMS to RPP v0.8
+# BMS to RPP v0.82
 # Copyright (C) 2020 shockdude
 # REAPER is property of Cockos Incorporated
 
@@ -23,7 +23,7 @@
 from pydub import AudioSegment
 
 def usage():
-	print("BMS to RPP v0.8")
+	print("BMS to RPP v0.82")
 	print("Convert a BMS or DTX chart into a playable REAPER project")
 	print("WAV keysounds recommended, OGG keysounds require ffmpeg/avconv and are slow to parse.")
 	print("Usage: {} chart_file.bms [output_filename.rpp]".format(sys.argv[0]))
@@ -113,73 +113,62 @@ def find_tag(line, tag):
 		return line[len(tag):]
 	return None
 
+# parse header value
+def get_header_value(line, header):
+	# Split an indexed header line such as "#WAV01 kick.wav" or "#BPM0A: 180"
+	# into its two-character index and its value.
+	#   line:   one line of chart text
+	#   header: header name without the leading "#" or the index, e.g. "WAV"
+	# Returns (index, value) as strings, or (None, None) if the line does not
+	# start with "#<header>", a two-character index, a separator (":" or
+	# whitespace) and a non-blank value.
+	# The value starts at the first non-whitespace character after the
+	# separator and excludes trailing whitespace (including any "\r"), so it
+	# can be used directly as a filename.
+	header_re = re.compile("#{}([\\w\\d][\\w\\d])(:\\s*|\\s+)(\\S.*?)\\s*$".format(header))
+	# match() only succeeds at position 0, so no separate start() check is needed
+	re_match = header_re.match(line)
+	if re_match is None:
+		return None, None
+	return re_match.group(1), re_match.group(3)
+
 # create dictionary of keysounds
 def add_keysound(line):
-	wav_re = re.compile("#WAV[\\w\\d][\\w\\d]")
-	re_match = wav_re.match(line)
-	if re_match != None and re_match.start() == 0:
-		line_split = line.split(" ")
-		keysound_index = line_split[0][-2:]
-		keysound_origname = " ".join(line_split[1:]).strip()
-		# look for wav or ogg, even if the original chart uses a different format
-		keysound_basename = os.path.splitext(keysound_origname)[0]
+	# Register the keysound file named by a "#WAVxx <filename>" header line.
+	# Returns True when the line is a WAV header and a matching file exists
+	# (keysound_dict[xx] is set to that filename), False when the line is not
+	# a WAV header. If neither file exists, prints an error and calls usage().
+	index, value = get_header_value(line, "WAV")
+	if index != None and value != None:
+		# try .wav first, then .ogg, regardless of the extension written in the chart
+		keysound_basename = os.path.splitext(value)[0]
 		keysound_filename = keysound_basename + WAV_EXT
 		if os.path.isfile(keysound_filename):
-			keysound_dict[keysound_index] = keysound_filename
+			keysound_dict[index] = keysound_filename
 			return True
 		keysound_filename = keysound_basename + OGG_EXT
 		if os.path.isfile(keysound_filename):
-			keysound_dict[keysound_index] = keysound_filename
+			keysound_dict[index] = keysound_filename
 			return True
-		print("Error: could not find .wav or .ogg for {}".format(keysound_origname))
+		# report the filename exactly as it was written in the chart
+		print("Error: could not find .wav or .ogg for {}".format(value))
 		usage()
 	return False
 
 # create dictionary of keysound volume percentages
 def add_keysoundvolume(line):
-	bpm_re = re.compile("#VOLUME[\\w\\d][\\w\\d]")
-	re_match = bpm_re.match(line)
-	if re_match != None and re_match.start() == 0:
-		line_split = line.split(" ")
-		vol_index = line_split[0][-2:]
-		vol = float(line_split[1].strip()) / 100.0
-		keysoundvol_dict[vol_index] = vol
+	# Parse a "#VOLUMExx <percent>" header line and store percent / 100.0 in
+	# keysoundvol_dict under index xx.
+	# Returns True if the line was a VOLUME header, False otherwise.
+	# A value that float() cannot parse raises ValueError.
+	index, value = get_header_value(line, "VOLUME")
+	if index != None and value != None:
+		keysoundvol_dict[index] = float(value) / 100.0
 		return True
 	return False
 
 # create dictionary of keysound pan percentages
 def add_keysoundpan(line):
-	bpm_re = re.compile("#PAN[\\w\\d][\\w\\d]")
-	re_match = bpm_re.match(line)
-	if re_match != None and re_match.start() == 0:
-		line_split = line.split(" ")
-		pan_index = line_split[0][-2:]
-		pan = float(line_split[1].strip()) / 100.0
-		keysoundpan_dict[pan_index] = pan
+	# Parse a "#PANxx <percent>" header line and store percent / 100.0 in
+	# keysoundpan_dict under index xx.
+	# Returns True if the line was a PAN header, False otherwise.
+	# A value that float() cannot parse raises ValueError.
+	index, value = get_header_value(line, "PAN")
+	if index != None and value != None:
+		keysoundpan_dict[index] = float(value) / 100.0
 		return True
 	return False
 
 # create dictionary of extended bpm values
 def add_bpmvalue(line):
-	bpm_re = re.compile("#BPM[\\w\\d][\\w\\d]")
-	re_match = bpm_re.match(line)
-	if re_match != None and re_match.start() == 0:
-		line_split = line.split(" ")
-		bpmvalue_index = line_split[0][-2:]
-		bpmvalue = float(line_split[1].strip())
-		extbpm_dict[bpmvalue_index] = bpmvalue
+	# Parse a "#BPMxx <value>" header line and store the value as a float in
+	# extbpm_dict under index xx.
+	# Returns True if the line was an indexed BPM header, False otherwise.
+	# A value that float() cannot parse raises ValueError.
+	index, value = get_header_value(line, "BPM")
+	if index != None and value != None:
+		extbpm_dict[index] = float(value)
 		return True
 	return False
 		
 # create dictionary of stop values
 def add_stopvalue(line):
-	bpm_re = re.compile("#STOP[\\w\\d][\\w\\d]")
-	re_match = bpm_re.match(line)
-	if re_match != None and re_match.start() == 0:
-		line_split = line.split(" ")
-		stopvalue_index = line_split[0][-2:]
-		stopvalue = float(line_split[1].strip())
-		stop_dict[stopvalue_index] = stopvalue
+	# Parse a "#STOPxx <value>" header line and store the value as a float in
+	# stop_dict under index xx.
+	# Returns True if the line was a STOP header, False otherwise.
+	# A value that float() cannot parse raises ValueError.
+	index, value = get_header_value(line, "STOP")
+	if index != None and value != None:
+		stop_dict[index] = float(value)
 		return True
 	return False
 
@@ -222,27 +211,26 @@ def update_data(data1, data2):
 def add_channel(line):
 	global max_measure
 	# use regular expression to match the channel format
-	note_re = re.compile("#\\d\\d\\d\\d\\d:")
+	note_re = re.compile("#(\\d\\d\\d[\\d\\w][\\d\\w])(:\\s*|\\s+)(\\S+)")
 	re_match = note_re.match(line)
 	if re_match != None and re_match.start() == 0:
-		line_split = line.split(":")
-		header = line_split[0][1:]
+		header = re_match.group(1)
 		measure = int(header[0:3])
 		channel = header[3:5]
-		data = line_split[1].strip()
-		data_array = data_to_array(data)
+		data = re_match.group(3)
 		
 		# set the largest measure found
 		if measure > max_measure:
 			max_measure = measure
 		
-		# channel with data array
 		if parsing_mode == MODE_BMS:
 			playable_channels = BMS_PLAYABLE_CHANNELS
 		elif parsing_mode == MODE_DTX:
 			playable_channels = DTX_PLAYABLE_CHANNELS
-		
+
+		# check for channel with data array
 		if channel in (playable_channels + (BPM_CHANNEL, EXTBPM_CHANNEL, STOP_CHANNEL)) and data != "00":
+			data_array = data_to_array(data)
 			if channel == "01":
 				# bgm tracks are special and shouldn't be merged
 				# dictionary maps to array of arrays instead
diff --git a/chunkedogg_extract.py b/chunkedogg_extract.py
@@ -0,0 +1,83 @@
+# Chunked OGG Extractor v0.1
+# Copyright (C) 2020 shockdude
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import sys
+import os
+import time
+import struct
+
+OGG_EXT = ".ogg"
+OGG_MAGIC = b"OggS"
+DATA_MAGIC = b"data"
+
+def usage():
+	# Print the program banner and command-line usage, pause for three seconds
+	# (presumably so the text can be read before a console window closes),
+	# then exit with status 1.
+	banner = (
+		"Chunked OGG extractor v0.1",
+		'Get a playable OGG out of a "chunked vorbis" WAV',
+		"Usage: {} file.wav [out.ogg]".format(sys.argv[0]),
+	)
+	for text in banner:
+		print(text)
+	time.sleep(3)
+	sys.exit(1)
+
+def find_ogg(in_filename, out_filename):
+	# Scan in_filename for Ogg pages and copy them, in order, to out_filename.
+	#   in_filename:  path of the file to scan (read as binary)
+	#   out_filename: path of the Ogg file to create (overwritten if it exists)
+	# Pages whose stream serial number is all Fs are skipped, and a page cut
+	# short by the end of the file is dropped instead of being written
+	# incomplete. Bytes between pages are ignored.
+	print("Writing ogg {} from {}".format(out_filename, in_filename))
+	# bytes of a page header that follow the 4-byte OggS magic keyword:
+	# stream structure version (1), header type flag (1),
+	# absolute granule position (8), stream serial number (4),
+	# page sequence number (4), page checksum (4), number of segments (1)
+	header_rest_length = 23
+	with open(in_filename, "rb") as in_file, open(out_filename, "wb") as out_file:
+		buf = in_file.read(4)
+		while len(buf) == 4:
+			if buf != OGG_MAGIC:
+				# slide the 4-byte window forward one byte, looking for OggS
+				buf = buf[1:] + in_file.read(1)
+				continue
+
+			# found OggS: read the rest of the fixed-size header in one go
+			header_rest = in_file.read(header_rest_length)
+			if len(header_rest) < header_rest_length:
+				break # file ends inside the page header
+			stream_serial_number = header_rest[10:14]
+			num_segments = header_rest[22]
+
+			# the segment table holds one length byte per segment
+			segment_table = in_file.read(num_segments)
+			if len(segment_table) < num_segments:
+				break # file ends inside the segment table
+			total_segments_length = sum(segment_table)
+
+			segments = in_file.read(total_segments_length)
+			if len(segments) < total_segments_length:
+				break # file ends inside the page data
+
+			# skip page if the serial number is all Fs
+			if stream_serial_number != b"\xff\xff\xff\xff":
+				out_file.write(buf + header_rest + segment_table + segments)
+
+			# move through the loop again
+			buf = in_file.read(4)
+
+def main():
+	# Command-line entry point.
+	#   sys.argv[1]: input file to scan (required; otherwise usage() is shown)
+	#   sys.argv[2]: output filename (optional)
+	# Without an output filename, the output is the input's base name with its
+	# last extension replaced by OGG_EXT, created in the current directory.
+	if len(sys.argv) < 2:
+		usage()
+	else:
+		in_filename = sys.argv[1]
+		if len(sys.argv) > 2:
+			out_filename = sys.argv[2]
+		else:
+			# strip only the final extension so "my.song.wav" becomes "my.song.ogg"
+			out_filename = os.path.splitext(os.path.basename(in_filename))[0] + OGG_EXT
+		# opening the output truncates it, so it must never be the input itself
+		if os.path.abspath(out_filename) == os.path.abspath(in_filename):
+			print("Error: output file {} would overwrite the input".format(out_filename))
+			usage()
+		find_ogg(in_filename, out_filename)
+
+if __name__ == "__main__":
+	main()